/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.config.store.hdfs;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Charsets;
import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.base.Strings;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.util.concurrent.UncheckedExecutionException;

import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;

import gobblin.config.common.impl.SingleLinkedListConfigKeyPath;
import gobblin.config.store.api.ConfigKeyPath;
import gobblin.config.store.api.ConfigStore;
import gobblin.config.store.api.ConfigStoreWithStableVersioning;
import gobblin.config.store.api.VersionDoesNotExistException;
import gobblin.config.store.deploy.ConfigStream;
import gobblin.config.store.deploy.Deployable;
import gobblin.config.store.deploy.DeployableConfigSource;
import gobblin.config.store.deploy.FsDeploymentConfig;
import gobblin.util.FileListUtils;
import gobblin.util.PathUtils;
import gobblin.util.io.SeekableFSInputStream;
import gobblin.util.io.StreamUtils;

import lombok.AllArgsConstructor;
import lombok.extern.slf4j.Slf4j;
/**
* An implementation of {@link ConfigStore} backed by HDFS. The class assumes a simple file and directory layout
* structure where each path under the root store directory corresponds to a dataset. The {@link #getStoreURI()} method
* gives an {@link URI} that identifies the HDFS cluster being used, as well as the root directory of the store. When
* querying this store, the scheme should be of the form {@code simple-[hdfs-scheme]} (a.k.a the logical scheme). For
 * example, if the store is located on the local filesystem the scheme should be {@code simple-file}, if the store
* is located on HDFS, the scheme should be {@code simple-hdfs}. This class can be constructed using a
* {@link SimpleHDFSConfigStoreFactory}.
*
* <p>
* The class assumes a directory called {@link #CONFIG_STORE_NAME} is under the root directory. This folder should
* contain a directory for each version deployed to the {@link ConfigStore}. An example directory structure could look
* like: <br>
* <blockquote>
* <code>
* /root<br>
* /my-simple-store<br>
* /_CONFIG_STORE<br>
* /v1.0<br>
* /dataset1<br>
* /child-dataset<br>
* /main.conf<br>
* /includes.conf<br>
* /dataset2<br>
* /main.conf<br>
* /child-dataset<br>
* /main.conf<br>
* </code>
* </blockquote>
* </p>
*
* <p>
* In the above example, the root of the store is {@code /root/my-simple-store/}. The code automatically assumes that
* this folder contains a directory named {@link #CONFIG_STORE_NAME}. In order to access the dataset
* {@code dataset1/child-dataset} using ConfigClient#getConfig(URI), the specified {@link URI} should be
* {@code simple-hdfs://[authority]:[port]/root/my-simple-store/dataset1/child-dataset/}. Note this is the fully
* qualified path to the actual {@link #MAIN_CONF_FILE_NAME} file on HDFS, with the {@link #CONFIG_STORE_NAME} and the
* {@code version} directories removed.
* </p>
*
* <p>
* All the {@link Config}s for a dataset should be put in the associated {@link #MAIN_CONF_FILE_NAME} file, and all
* the imports should be put in the associated {@link #INCLUDES_CONF_FILE_NAME} file.
* </p>
*
* <p>
* This class is not responsible for deploying configurations from an external source to HDFS, only for reading them.
* </p>
*
* @see SimpleHDFSConfigStoreFactory
*/
@Slf4j
@ConfigStoreWithStableVersioning
public class SimpleHDFSConfigStore implements ConfigStore, Deployable<FsDeploymentConfig> {

  /** Name of the directory under the store root that holds all deployed versions. */
  protected static final String CONFIG_STORE_NAME = "_CONFIG_STORE";

  /** Per-dataset file holding the dataset's configuration key-values. */
  private static final String MAIN_CONF_FILE_NAME = "main.conf";

  /** Per-dataset file listing the paths this dataset imports. */
  private static final String INCLUDES_CONF_FILE_NAME = "includes.conf";

  /** Key under which the includes list is parsed into an in-memory Typesafe {@link Config}. */
  private static final String INCLUDES_KEY_NAME = "includes";

  // FileSystem the store lives on; the constructor enforces its scheme matches physicalStoreRoot's.
  private final FileSystem fs;

  // Fully-qualified URI of the store root on the physical filesystem.
  private final URI physicalStoreRoot;

  // Logical URI (e.g. simple-hdfs://...) returned by getStoreURI().
  private final URI logicalStoreRoot;

  // Caches the resolved root Path per version so the existence check hits the filesystem once.
  private final Cache<String, Path> versions;

  // Reads/writes the store metadata file tracking the current active version.
  private final SimpleHDFSStoreMetadata storeMetadata;
/**
* Constructs a {@link SimpleHDFSConfigStore} using a given {@link FileSystem} and a {@link URI} that points to the
* physical location of the store root.
*
* @param fs the {@link FileSystem} the {@link ConfigStore} is stored on.
* @param physicalStoreRoot the fully qualified {@link URI} of the physical store root, the {@link URI#getScheme()} of the
* {@link URI} should match the {@link FileSystem#getScheme()} of the given {@link FileSystem}.
* @param logicalStoreRoot the fully qualfied {@link URI} of the logical store root
*/
protected SimpleHDFSConfigStore(FileSystem fs, URI physicalStoreRoot, URI logicalStoreRoot) {
Preconditions.checkNotNull(fs, "fs cannot be null!");
Preconditions.checkNotNull(physicalStoreRoot, "physicalStoreRoot cannot be null!");
Preconditions.checkNotNull(logicalStoreRoot, "logicalStoreRoot cannot be null!");
this.fs = fs;
Preconditions.checkArgument(!Strings.isNullOrEmpty(physicalStoreRoot.getScheme()),
"The physicalStoreRoot must have a valid scheme!");
Preconditions.checkArgument(physicalStoreRoot.getScheme().equals(fs.getUri().getScheme()),
"The scheme of the physicalStoreRoot and the filesystem must match!");
Preconditions.checkArgument(!Strings.isNullOrEmpty(physicalStoreRoot.getPath()),
"The path of the physicalStoreRoot must be valid as it is the root of the store!");
this.physicalStoreRoot = physicalStoreRoot;
this.logicalStoreRoot = logicalStoreRoot;
this.versions = CacheBuilder.newBuilder().build();
this.storeMetadata = new SimpleHDFSStoreMetadata(fs, new Path(new Path(this.physicalStoreRoot), CONFIG_STORE_NAME));
}
/**
* Returns a {@link String} representation of the active version stored in the {@link ConfigStore}. This method
* determines the current active version by reading the {@link #CONFIG_STORE_METADATA_FILENAME} in
* {@link #CONFIG_STORE_NAME}
*
* @return a {@link String} representing the current active version of the {@link ConfigStore}.
*/
@Override
public String getCurrentVersion() {
try {
return this.storeMetadata.getCurrentVersion();
} catch (IOException e) {
Path configStoreDir = new Path(new Path(this.physicalStoreRoot), CONFIG_STORE_NAME);
throw new RuntimeException(
String.format("Error while checking current version for configStoreDir: \"%s\"", configStoreDir), e);
}
}
/**
* Returns a {@link URI} representing the logical store {@link URI} where the {@link URI#getPath()} is the path to
* the root of the {@link ConfigStore}.
*
* @return a {@link URI} representing the logical store {@link URI} (e.g. simple-hdfs://[authority]:[port][path-to-root]).
*/
@Override
public URI getStoreURI() {
return this.logicalStoreRoot;
}
/**
* Retrieves all the children of the given {@link ConfigKeyPath} by doing a {@code ls} on the {@link Path} specified
* by the {@link ConfigKeyPath}. If the {@link Path} described by the {@link ConfigKeyPath} does not exist, an empty
* {@link Collection} is returned.
*
* @param configKey the config key path whose children are necessary.
* @param version specify the configuration version in the configuration store.
*
* @return a {@link Collection} of {@link ConfigKeyPath} where each entry is a child of the given configKey.
*
* @throws VersionDoesNotExistException if the version specified cannot be found in the {@link ConfigStore}.
*/
@Override
public Collection<ConfigKeyPath> getChildren(ConfigKeyPath configKey, String version)
throws VersionDoesNotExistException {
Preconditions.checkNotNull(configKey, "configKey cannot be null!");
Preconditions.checkArgument(!Strings.isNullOrEmpty(version), "version cannot be null or empty!");
List<ConfigKeyPath> children = new ArrayList<>();
Path datasetDir = getDatasetDirForKey(configKey, version);
try {
if (!this.fs.exists(datasetDir)) {
return children;
}
for (FileStatus fileStatus : this.fs.listStatus(datasetDir)) {
if (fileStatus.isDirectory()) {
children.add(configKey.createChild(fileStatus.getPath().getName()));
}
}
return children;
} catch (IOException e) {
throw new RuntimeException(String.format("Error while getting children for configKey: \"%s\"", configKey), e);
}
}
/**
* Retrieves all the {@link ConfigKeyPath}s that are imported by the given {@link ConfigKeyPath}. This method does this
* by reading the {@link #INCLUDES_CONF_FILE_NAME} file associated with the dataset specified by the given
* {@link ConfigKeyPath}. If the {@link Path} described by the {@link ConfigKeyPath} does not exist, then an empty
* {@link List} is returned.
*
* @param configKey the config key path whose tags are needed
* @param version the configuration version in the configuration store.
*
* @return a {@link List} of {@link ConfigKeyPath}s where each entry is a {@link ConfigKeyPath} imported by the dataset
* specified by the configKey.
*
* @throws VersionDoesNotExistException if the version specified cannot be found in the {@link ConfigStore}.
*/
@Override
public List<ConfigKeyPath> getOwnImports(ConfigKeyPath configKey, String version)
throws VersionDoesNotExistException {
Preconditions.checkNotNull(configKey, "configKey cannot be null!");
Preconditions.checkArgument(!Strings.isNullOrEmpty(version), "version cannot be null or empty!");
List<ConfigKeyPath> configKeyPaths = new ArrayList<>();
Path datasetDir = getDatasetDirForKey(configKey, version);
Path includesFile = new Path(datasetDir, INCLUDES_CONF_FILE_NAME);
try {
if (!this.fs.exists(includesFile)) {
return configKeyPaths;
}
FileStatus includesFileStatus = this.fs.getFileStatus(includesFile);
if (!includesFileStatus.isDirectory()) {
try (InputStream includesConfInStream = this.fs.open(includesFileStatus.getPath())) {
/*
* The includes returned are used to build a fallback chain.
* With the natural order, if a key found in the first include it is not be overriden by the next include.
* By reversing the list, the Typesafe fallbacks are constructed bottom up.
*/
configKeyPaths.addAll(Lists.newArrayList(
Iterables.transform(Lists.reverse(resolveIncludesList(IOUtils.readLines(includesConfInStream, Charsets.UTF_8))),
new IncludesToConfigKey())));
}
}
} catch (IOException e) {
throw new RuntimeException(String.format("Error while getting config for configKey: \"%s\"", configKey), e);
}
return configKeyPaths;
}
/**
* A helper to resolve System properties and Environment variables in includes paths
* The method loads the list of unresolved <code>includes</code> into an in-memory {@link Config} object and reolves
* with a fallback on {@link ConfigFactory#defaultOverrides()}
*
* @param includes list of unresolved includes
* @return a list of resolved includes
*/
@VisibleForTesting
public static List<String> resolveIncludesList(List<String> includes) {
// Create a TypeSafe Config object with Key INCLUDES_KEY_NAME and value an array of includes
StringBuilder includesBuilder = new StringBuilder();
for (String include : includes) {
// Skip comments
if (StringUtils.isNotBlank(include) && !StringUtils.startsWith(include, "#")) {
includesBuilder.append(INCLUDES_KEY_NAME).append("+=").append(include).append("\n");
}
}
// Resolve defaultOverrides and environment variables.
if (includesBuilder.length() > 0) {
return ConfigFactory.parseString(includesBuilder.toString()).withFallback(ConfigFactory.defaultOverrides())
.withFallback(ConfigFactory.systemEnvironment()).resolve().getStringList(INCLUDES_KEY_NAME);
}
return Collections.emptyList();
}
/**
* Retrieves the {@link Config} for the given {@link ConfigKeyPath} by reading the {@link #MAIN_CONF_FILE_NAME}
* associated with the dataset specified by the given {@link ConfigKeyPath}. If the {@link Path} described by the
* {@link ConfigKeyPath} does not exist then an empty {@link Config} is returned.
*
* @param configKey the config key path whose properties are needed.
* @param version the configuration version in the configuration store.
*
* @return a {@link Config} for the given configKey.
*
* @throws VersionDoesNotExistException if the version specified cannot be found in the {@link ConfigStore}.
*/
@Override
public Config getOwnConfig(ConfigKeyPath configKey, String version) throws VersionDoesNotExistException {
Preconditions.checkNotNull(configKey, "configKey cannot be null!");
Preconditions.checkArgument(!Strings.isNullOrEmpty(version), "version cannot be null or empty!");
Path datasetDir = getDatasetDirForKey(configKey, version);
Path mainConfFile = new Path(datasetDir, MAIN_CONF_FILE_NAME);
try {
if (!this.fs.exists(mainConfFile)) {
return ConfigFactory.empty();
}
FileStatus configFileStatus = this.fs.getFileStatus(mainConfFile);
if (!configFileStatus.isDirectory()) {
try (InputStream mainConfInputStream = this.fs.open(configFileStatus.getPath())) {
return ConfigFactory.parseReader(new InputStreamReader(mainConfInputStream, Charsets.UTF_8));
}
}
return ConfigFactory.empty();
} catch (IOException e) {
throw new RuntimeException(String.format("Error while getting config for configKey: \"%s\"", configKey), e);
}
}
/**
* Retrieves the dataset dir on HDFS associated with the given {@link ConfigKeyPath} and the given version. This
* directory contains the {@link #MAIN_CONF_FILE_NAME} and {@link #INCLUDES_CONF_FILE_NAME} file, as well as any child
* datasets.
*/
private Path getDatasetDirForKey(ConfigKeyPath configKey, String version) throws VersionDoesNotExistException {
String datasetFromConfigKey = getDatasetFromConfigKey(configKey);
if (StringUtils.isBlank(datasetFromConfigKey)) {
return getVersionRoot(version);
}
return new Path(getVersionRoot(version), datasetFromConfigKey);
}
/**
* Retrieves the name of a dataset from a given {@link ConfigKeyPath}, relative to the store root.
*/
private static String getDatasetFromConfigKey(ConfigKeyPath configKey) {
return StringUtils.removeStart(configKey.getAbsolutePathString(), SingleLinkedListConfigKeyPath.PATH_DELIMETER);
}
/**
* Constructs a {@link Path} that points to the location of the given version of the {@link ConfigStore} on HDFS. If
* this {@link Path} does not exist, a {@link VersionDoesNotExistException} is thrown.
*/
private Path getVersionRoot(String version) throws VersionDoesNotExistException {
try {
return this.versions.get(version, new VersionRootLoader(version));
} catch (ExecutionException e) {
throw new RuntimeException(
String.format("Error while checking if version \"%s\" for store \"%s\" exists", version, getStoreURI()), e);
}
}
/**
* Implementation of {@link Callable} that finds the root {@link Path} of a specified version. To be used in
* conjunction with the {@link #versions} cache.
*/
@AllArgsConstructor
private class VersionRootLoader implements Callable<Path> {
private String version;
@Override
public Path call() throws IOException {
Path versionRootPath = PathUtils.combinePaths(SimpleHDFSConfigStore.this.physicalStoreRoot.toString(),
CONFIG_STORE_NAME, this.version);
if (SimpleHDFSConfigStore.this.fs.isDirectory(versionRootPath)) {
return versionRootPath;
}
throw new VersionDoesNotExistException(getStoreURI(), this.version,
String.format("Cannot find specified version under root %s", versionRootPath));
}
}
/**
* Implementation of {@link Function} that translates a {@link String} in an {@link #INCLUDES_CONF_FILE_NAME} file to
* a {@link ConfigKeyPath}.
*/
private static class IncludesToConfigKey implements Function<String, ConfigKeyPath> {
@Override
public ConfigKeyPath apply(String input) {
if (input == null) {
return null;
}
ConfigKeyPath configKey = SingleLinkedListConfigKeyPath.ROOT;
for (String file : Splitter.on(SingleLinkedListConfigKeyPath.PATH_DELIMETER).omitEmptyStrings().split(input)) {
configKey = configKey.createChild(file);
}
return configKey;
}
}
/**
* Deploy configs provided by {@link FsDeploymentConfig#getDeployableConfigSource()} to HDFS.
* For each {@link ConfigStream} returned by {@link DeployableConfigSource#getConfigStreams()}, creates a resource on HDFS.
* <br>
* <ul> Does the following:
* <li> Read {@link ConfigStream}s and write them to HDFS
* <li> Create parent directories of {@link ConfigStream#getConfigPath()} if required
* <li> Set {@link FsDeploymentConfig#getStorePermissions()} to all resourced created on HDFS
* <li> Update current active version in the store metadata file.
* </ul>
*
* <p>
* For example: If "test-root" is a resource in classpath and all resources under it needs to be deployed,
* <br>
* <br>
* <b>In Classpath:</b><br>
* <blockquote> <code>
* test-root<br>
* /data<br>
* /set1<br>
* /main.conf<br>
* /tag<br>
* /tag1<br>
* /main.conf<br>
* </code> </blockquote>
* </p>
*
* <p>
* A new version 2.0.0 {@link FsDeploymentConfig#getNewVersion()} is created on HDFS under <code>this.physicalStoreRoot/_CONFIG_STORE</code>
* <br>
* <br>
* <b>On HDFS after deploy:</b><br>
* <blockquote> <code>
* /_CONFIG_STORE<br>
* /2.0.0<br>
* /data<br>
* /set1<br>
* /main.conf<br>
* /tag<br>
* /tag1<br>
* /main.conf<br>
* </code> </blockquote>
* </p>
*
*/
@Override
public void deploy(FsDeploymentConfig deploymentConfig) throws IOException {
log.info("Deploying with config : " + deploymentConfig);
Path hdfsconfigStoreRoot = new Path(this.physicalStoreRoot.getPath(), CONFIG_STORE_NAME);
if (!this.fs.exists(hdfsconfigStoreRoot)) {
throw new IOException("Config store root not present at " + this.physicalStoreRoot.getPath());
}
Path hdfsNewVersionPath = new Path(hdfsconfigStoreRoot, deploymentConfig.getNewVersion());
if (!this.fs.exists(hdfsNewVersionPath)) {
this.fs.mkdirs(hdfsNewVersionPath, deploymentConfig.getStorePermissions());
Set<ConfigStream> confStreams = deploymentConfig.getDeployableConfigSource().getConfigStreams();
for (ConfigStream confStream : confStreams) {
String confAtPath = confStream.getConfigPath();
log.info("Copying resource at : " + confAtPath);
Path hdsfConfPath = new Path(hdfsNewVersionPath, confAtPath);
if (!this.fs.exists(hdsfConfPath.getParent())) {
this.fs.mkdirs(hdsfConfPath.getParent());
}
// If an empty directory needs to created it may not have a stream.
if (confStream.getInputStream().isPresent()) {
// Read the resource as a stream from the classpath and write it to HDFS
try (SeekableFSInputStream inputStream = new SeekableFSInputStream(confStream.getInputStream().get());
FSDataOutputStream os = this.fs.create(hdsfConfPath, false)) {
StreamUtils.copy(inputStream, os);
}
}
}
// Set permission for newly copied files
for (FileStatus fileStatus : FileListUtils.listPathsRecursively(this.fs, hdfsNewVersionPath,
FileListUtils.NO_OP_PATH_FILTER)) {
this.fs.setPermission(fileStatus.getPath(), deploymentConfig.getStorePermissions());
}
} else {
log.warn(String.format(
"STORE WITH VERSION %s ALREADY EXISTS. NEW RESOURCES WILL NOT BE COPIED. ONLY STORE MEATADATA FILE WILL BE UPDATED TO %s",
deploymentConfig.getNewVersion(), deploymentConfig.getNewVersion()));
}
this.storeMetadata.setCurrentVersion(deploymentConfig.getNewVersion());
log.info(String.format("New version %s of config store deployed at %s", deploymentConfig.getNewVersion(),
hdfsconfigStoreRoot));
}
@VisibleForTesting
URI getPhysicalStoreRoot() {
return this.physicalStoreRoot;
}
}