/*
* Copyright 2013 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cloudera.cdk.data;
import com.cloudera.cdk.data.filesystem.FileSystemDatasetRepository;
import com.cloudera.cdk.data.spi.Loadable;
import com.cloudera.cdk.data.spi.OptionBuilder;
import com.cloudera.cdk.data.spi.URIPattern;
import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.URI;
import java.util.Map;
import java.util.ServiceLoader;
/**
 * <p>Convenience methods for working with {@link DatasetRepository} instances.</p>
 *
 * <p>
 * Repository implementations are discovered through {@link ServiceLoader}
 * when this class is initialized: every {@link Loadable} found is loaded and
 * is expected to register its URI patterns with this class. Repository URIs
 * passed to {@link #open(java.net.URI)} are then matched against the
 * registered patterns in registration order.
 * </p>
 *
 * @since 0.8.0
 */
public class DatasetRepositories {

  private static final Logger logger = LoggerFactory.getLogger(DatasetRepositories.class);

  // Matches "repo:<storage-uri>" and captures everything after the scheme.
  private static final URIPattern BASE_PATTERN = new URIPattern(
      URI.create("repo:*storage-uri"));

  // Insertion-ordered so that patterns are tried in the order they were
  // registered; the first pattern that matches a storage URI wins.
  // NOTE(review): this is a plain LinkedHashMap with no synchronization;
  // presumably registration only happens during class initialization - confirm.
  private static final Map<URIPattern, OptionBuilder<DatasetRepository>>
      REGISTRY = Maps.newLinkedHashMap();

  /**
   * Registers a {@link URIPattern} and an {@link OptionBuilder} to create
   * instances of DatasetRepository from the pattern's match options.
   * <p>
   * Registering a pattern equal to one that is already registered replaces
   * the builder previously associated with it.
   * </p>
   *
   * @param pattern a URIPattern
   * @param builder an OptionBuilder that expects options defined by
   *                {@code pattern} and builds DatasetRepository instances.
   */
  static void register(
      URIPattern pattern, OptionBuilder<DatasetRepository> builder) {
    REGISTRY.put(pattern, builder);
  }

  static {
    // load implementations, which will register themselves
    final ServiceLoader<Loadable> impls = ServiceLoader.load(Loadable.class);
    for (Loadable loader : impls) {
      // the ServiceLoader is lazy, so this iteration forces service loading
      logger.debug("Loading: {}", loader.getClass().getName());
      loader.load();
    }
    // only pay for joining the registered patterns when the message is emitted
    if (logger.isDebugEnabled()) {
      logger.debug("Registered repository URIs: {}",
          Joiner.on(", ").join(REGISTRY.keySet()));
    }
  }

  /**
   * Synonym for {@link #open(java.net.URI)} for String URIs.
   *
   * @param uri a String URI
   * @return a DatasetRepository for the given URI.
   * @throws IllegalArgumentException If the String cannot be parsed into a
   *                                  valid {@link java.net.URI}, or if
   *                                  {@link #open(java.net.URI)} rejects the
   *                                  parsed URI.
   */
  public static DatasetRepository open(String uri) {
    // uses of URI.create throw IllegalArgumentException if the URI is invalid
    return open(URI.create(uri));
  }

  /**
   * <p>
   * Open a {@link DatasetRepository} for the given URI.
   * </p>
   * <p>
   * This method provides a simpler way to connect to a {@link DatasetRepository}
   * while providing information about the appropriate {@link MetadataProvider}
   * and other options to use. For almost all cases, this is the preferred method
   * of retrieving an instance of a {@link DatasetRepository}.
   * </p>
   * <p>
   * The format of a repository URI is as follows.
   * </p>
   * <code>repo:[storage component]</code>
   * <p>
   * The <code>[storage component]</code> indicates the underlying metadata and,
   * in some cases, physical storage of the data, along with any options. The
   * supported storage backends are:
   * </p>
   * <h1>Local FileSystem URIs</h1>
   * <p>
   * <code>file:[path]</code> where <code>[path]</code> is a relative or absolute
   * filesystem path to be used as the dataset repository root directory in which
   * to store dataset data. When specifying an absolute path, the
   * <q>null authority</q> (i.e. <code>file:///my/path</code>)
   * form may be used. Alternatively, the authority section may be omitted
   * entirely (e.g. <code>file:/my/path</code>). Either way, it is illegal to
   * provide an authority (i.e.
   * <code>file://this-part-is-illegal/my/path</code>). This storage backend
   * will produce a {@link DatasetRepository} that stores both data and metadata
   * on the local operating system filesystem. See
   * {@link FileSystemDatasetRepository} for more information.
   * </p>
   * <h1>HDFS FileSystem URIs</h1>
   * <p>
   * <code>hdfs://[host]:[port]/[path]</code> where <code>[host]</code> and
   * <code>[port]</code> indicate the location of the Hadoop NameNode, and
   * <code>[path]</code> is the dataset repository root directory in which to
   * store dataset data. This form will load the Hadoop configuration
   * information per the usual methods (i.e. searching the process's classpath
   * for the various configuration files). This storage backend will produce a
   * {@link DatasetRepository} that stores both data and metadata in HDFS. See
   * {@link FileSystemDatasetRepository} for more information.
   * </p>
   * <h1>Hive/HCatalog URIs</h1>
   * <p>
   * <code>hive</code> and
   * <code>hive://[metastore-host]:[metastore-port]/</code> will connect to the
   * Hive MetaStore. Dataset locations will be determined by Hive as managed
   * tables.
   * </p>
   * <p>
   * <code>hive:/[path]</code> and
   * <code>hive://[metastore-host]:[metastore-port]/[path]</code> will also
   * connect to the Hive MetaStore, but tables will be external and stored
   * under <code>[path]</code>. The repository storage layout will be the same
   * as <code>hdfs</code> and <code>file</code> repositories. HDFS connection
   * options can be supplied by adding <code>hdfs-host</code> and
   * <code>hdfs-port</code> query options to the URI (see examples).
   * </p>
   * <h1>HBase URIs</h1>
   * <p>
   * <code>repo:hbase:[zookeeper-host1]:[zk-port],[zookeeper-host2],...
   * </code> will open a HBase-backed DatasetRepository. This URI may also be
   * instantiated with {@link #openRandomAccess(URI)} to instantiate a {@link
   * RandomAccessDatasetRepository}
   * </p>
   * <h1>Examples</h1>
   * <table>
   * <tr>
   * <td><code>repo:file:foo/bar</code></td>
   * <td>Store data+metadata on the local filesystem in the directory
   * <code>./foo/bar</code>.</td>
   * </tr>
   * <tr>
   * <td><code>repo:file:///data</code></td>
   * <td>Store data+metadata on the local filesystem in the directory
   * <code>/data</code></td>
   * </tr>
   * <tr>
   * <td><code>repo:hdfs://localhost:8020/data</code></td>
   * <td>Same as above, but stores data+metadata on HDFS.</td>
   * </tr>
   * <tr>
   * <td><code>repo:hive</code></td>
   * <td>Connects to the Hive MetaStore and creates managed tables.</td>
   * </tr>
   * <tr>
   * <td><code>repo:hive://meta-host:9083/</code></td>
   * <td>Connects to the Hive MetaStore at <code>thrift://meta-host:9083</code>,
   * and creates managed tables. This only matches when the path is
   * <code>/</code>. Any non-root path will match the external Hive URIs.</td>
   * </tr>
   * <tr>
   * <td><code>repo:hive:/path?hdfs-host=localhost&amp;hdfs-port=8020</code></td>
   * <td>Connects to the default Hive MetaStore and creates external tables
   * stored in <code>hdfs://localhost:8020/</code> at <code>path</code>.
   * <code>hdfs-host</code> and <code>hdfs-port</code> are optional.
   * </td>
   * </tr>
   * <tr>
   * <td>
   * <code>repo:hive://meta-host:9083/path?hdfs-host=localhost&amp;hdfs-port=8020
   * </code>
   * </td>
   * <td>
   * Connects to the Hive MetaStore at <code>thrift://meta-host:9083/</code>
   * and creates external tables stored in <code>hdfs://localhost:8020/</code>
   * at <code>path</code>. <code>hdfs-host</code> and <code>hdfs-port</code>
   * are optional.
   * </td>
   * </tr>
   * <tr>
   * <td>
   * <code>repo:hbase:zk1,zk2,zk3</code>
   * </td>
   * <td>
   * Connects to HBase via the given zookeeper quorum nodes.
   * </td>
   * </tr>
   * </table>
   *
   * @param repositoryUri The repository URI
   * @return An appropriate implementation of {@link DatasetRepository}
   * @throws IllegalArgumentException If {@code repositoryUri} does not use the
   *                                  <code>repo:</code> scheme, or if its
   *                                  storage component does not match any
   *                                  registered URI pattern.
   * @since 0.8.0
   */
  public static DatasetRepository open(URI repositoryUri) {
    final Map<String, String> baseMatch = BASE_PATTERN.getMatch(repositoryUri);
    Preconditions.checkArgument(baseMatch != null,
        "Invalid dataset repository URI:%s - scheme must be `repo:`",
        repositoryUri);

    // everything after "repo:" is itself a URI that selects the storage backend
    final URI storage = URI.create(baseMatch.get("storage-uri"));

    // patterns are tried in registration order; the first match wins
    for (Map.Entry<URIPattern, OptionBuilder<DatasetRepository>> registered
        : REGISTRY.entrySet()) {
      final Map<String, String> match = registered.getKey().getMatch(storage);
      if (match != null) {
        final DatasetRepository repo =
            registered.getValue().getFromOptions(match);
        logger.debug(
            "Connected to repository:{} using uri:{}", repo, repositoryUri);
        return repo;
      }
    }

    throw new IllegalArgumentException("Unknown storage URI:" + storage);
  }

  /**
   * Synonym for {@link #openRandomAccess(java.net.URI)} for String URIs.
   *
   * @param uri a String URI
   * @return An appropriate implementation of {@link RandomAccessDatasetRepository}
   * @throws IllegalArgumentException If the String cannot be parsed into a
   *                                  valid {@link java.net.URI}, or if
   *                                  {@link #open(java.net.URI)} rejects the
   *                                  parsed URI.
   * @throws ClassCastException If the repository opened for the URI is not a
   *                            {@link RandomAccessDatasetRepository}.
   * @since 0.9.0
   */
  public static RandomAccessDatasetRepository openRandomAccess(String uri) {
    // URI.create throws IllegalArgumentException if the URI is invalid
    return openRandomAccess(URI.create(uri));
  }

  /**
   * <p>
   * Synonym for {@link #open(java.net.URI)} for {@link RandomAccessDatasetRepository}s
   * </p>
   * <p>
   * This method provides a simpler way to connect to a {@link DatasetRepository} the same
   * way {@link #open(java.net.URI)} does, but instead returns an implementation of type
   * {@link RandomAccessDatasetRepository}. This method should be used when one needs to
   * access {@link RandomAccessDataset}s to take advantage of the random access methods.
   * </p>
   * <p>
   * The format of a repository URI is as follows.
   * </p>
   * <code>repo:[storage component]</code>
   * <p>
   * The <code>[storage component]</code> indicates the underlying metadata and,
   * in some cases, physical storage of the data, along with any options. The
   * supported storage backends are:
   * </p>
   * <h1>HBase URIs</h1>
   * <p>
   * <code>repo:hbase:[zookeeper-host1]:[zk-port],[zookeeper-host2],...
   * </code> will open a HBase-backed {@link RandomAccessDatasetRepository}.
   * The same URI may also be passed to {@link #open(java.net.URI)} to obtain
   * it as a plain {@link DatasetRepository}.
   * </p>
   * <h1>Examples</h1>
   * <table>
   * <tr>
   * <td>
   * <code>repo:hbase:zk1,zk2,zk3
   * </code>
   * </td>
   * <td>
   * Connects to HBase via the given zookeeper quorum nodes.
   * </td>
   * </tr>
   * </table>
   *
   * @param repositoryUri The repository URI
   * @return An appropriate implementation of {@link RandomAccessDatasetRepository}
   * @throws IllegalArgumentException If {@link #open(java.net.URI)} rejects
   *                                  {@code repositoryUri}.
   * @throws ClassCastException If the repository opened for
   *                            {@code repositoryUri} is not a
   *                            {@link RandomAccessDatasetRepository}.
   * @since 0.9.0
   */
  public static RandomAccessDatasetRepository openRandomAccess(URI repositoryUri) {
    // The repository is opened first and then downcast, so a URI that resolves
    // to a backend without random access fails here with ClassCastException.
    return (RandomAccessDatasetRepository) open(repositoryUri);
  }
}