/**
* (c) Copyright 2012 WibiData, Inc.
*
* See the NOTICE file distributed with this work for additional
* information regarding copyright ownership.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.kiji.mapreduce.kvstore.lib;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Formatter;
import java.util.List;
import java.util.Locale;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.kiji.annotations.ApiAudience;
import org.kiji.annotations.ApiStability;
import org.kiji.mapreduce.kvstore.framework.KeyValueStoreConfiguration;
import org.kiji.mapreduce.util.Lists;
/**
* Helper class that manages filenames and distributed cache functionality
* for KeyValueStore implementations that work with files or collections
* of files.
*
* <p>Your KeyValueStore can use a FileStoreHelper to manage all aspects of
* configuring and deserializing the file names used in a MapReduce job.</p>
*
* <p>Create a FileStoreHelper.Builder object using the builder() method;
* your own KeyValueStore's Builder should use composition to delegate
* responsibility for Configuration, file, and distributed cache control
* to this one. You should use this object's storeToConf() and initFromConf()
* methods within your own KeyValueStore's storeToConf() and initFromConf()
* methods.</p>
*
* <p>When reading files, use getExpandedInputPaths() to get a complete list
* of files to read. If the user has enabled the distributed cache, you will
* receive a set of local files to read. Otherwise, the initial HDFS paths
* will be used.</p>
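*
* <p>For illustration, a minimal sketch of this delegation pattern; the
* surrounding store class and its {@code mFileHelper} field are hypothetical,
* not part of this API:</p>
*
* <pre>{@code
*   // Inside a hypothetical file-backed KeyValueStore implementation:
*   private final FileStoreHelper mFileHelper = FileStoreHelper.builder()
*       .withInputPath(new Path("hdfs:///data/my-store"))
*       .build();
*
*   public void storeToConf(KeyValueStoreConfiguration conf) throws IOException {
*     mFileHelper.storeToConf(conf); // Serialize paths and DCache settings.
*     // ... then serialize any state specific to this store.
*   }
*
*   public void initFromConf(KeyValueStoreConfiguration conf) throws IOException {
*     mFileHelper.initFromConf(conf); // Restore paths (possibly DCache-local).
*     // ... then restore store-specific state.
*   }
* }</pre>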
*/
@ApiAudience.Public
@ApiStability.Evolving
public final class FileStoreHelper implements Configurable {
private static final Logger LOG = LoggerFactory.getLogger(FileStoreHelper.class);
/**
* Configuration key for the KeyValueStore definition that records whether input
* files are stored in the DistributedCache. If its value is empty, the DCache is
* disabled; if non-empty, DCache target file names are expected to be prefixed
* by that value.
*/
public static final String CONF_DCACHE_PREFIX_KEY = "dcache.prefix";
/**
* Boolean flag used in XML Configuration files only, stating that the specified
* files are HDFS files that should nonetheless be loaded into the DistributedCache
* as part of the job. Reading this flag sets mUseDCache; the flag itself is not
* recorded as part of storeToConf().
*/
public static final String CONF_USE_DCACHE_KEY = "dcache";
/**
* By default, it is assumed that the user wants to load this KeyValueStore
* through the DistributedCache.
*/
public static final boolean USE_DCACHE_DEFAULT = true;
/**
* Configuration key suffix for the KeyValueStore definition that holds the list
* of input paths. The value may contain multiple comma-delimited paths.
*/
public static final String CONF_PATHS_KEY = "paths";
/** The Hadoop configuration. */
private Configuration mConf;
/** True if we should distribute the input files via the DistributedCache. */
private boolean mUseDCache;
/** Files stored in the distributed cache have this as their prefix. */
private String mDCachePrefix;
/** List of input paths to files to include in the store. */
private List<Path> mInputPaths;
/**
* A class that builds configured FileStoreHelper instances.
*
* <p>This object is not exposed to users directly. It is used via composition
* in other (FooFile)KeyValueStore.Builder instances. If you add a method here,
* you should mirror it in the other file-backed store builder APIs.</p>
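*
* <p>A typical construction, assuming an existing Hadoop Configuration named
* {@code conf} and an input glob of your choosing (a sketch, not a fixed
* recipe):</p>
*
* <pre>{@code
*   FileStoreHelper helper = FileStoreHelper.builder()
*       .withConfiguration(conf)                         // Hadoop config to use.
*       .withInputPath(new Path("hdfs:///data/part-*"))  // File, dir, or glob.
*       .withDistributedCache(true)                      // Ship via the DCache.
*       .build();
* }</pre>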
*/
@ApiAudience.Public
@ApiStability.Evolving
public static final class Builder {
private Configuration mConf;
private List<Path> mInputPaths;
private boolean mUseDCache;
/** Private constructor; instances are created via FileStoreHelper.builder(). */
private Builder() {
mInputPaths = new ArrayList<Path>();
mUseDCache = USE_DCACHE_DEFAULT;
mConf = new Configuration();
}
/**
* Sets the Hadoop configuration instance to use.
*
* @param conf The configuration.
* @return This builder instance.
*/
public Builder withConfiguration(Configuration conf) {
mConf = conf;
return this;
}
/**
* Adds a path to the list of files to load.
*
* @param path The input file/directory path.
* @return This builder instance.
*/
public Builder withInputPath(Path path) {
mInputPaths.add(path);
return this;
}
/**
* Replaces the current list of files to load with the set of files
* specified as an argument.
*
* @param paths The input file/directory paths.
* @return This builder instance.
*/
public Builder withInputPaths(List<Path> paths) {
mInputPaths.clear();
mInputPaths.addAll(paths);
return this;
}
/**
* Sets a flag indicating the use of the DistributedCache to distribute
* input files.
*
* @param enabled true if the DistributedCache should be used, false otherwise.
* @return This builder instance.
*/
public Builder withDistributedCache(boolean enabled) {
mUseDCache = enabled;
return this;
}
/**
* Builds and returns a new FileStoreHelper instance.
*
* @return a new, configured FileStoreHelper.
*/
public FileStoreHelper build() {
return new FileStoreHelper(this);
}
}
/** @return a new FileStoreHelper.Builder instance. */
public static Builder builder() {
return new Builder();
}
/**
* Constructor invoked by Builder.build().
*
* @param builder the builder to configure from.
*/
private FileStoreHelper(Builder builder) {
mConf = builder.mConf;
mInputPaths = new ArrayList<Path>(builder.mInputPaths); // Defensive copy.
mUseDCache = builder.mUseDCache;
mDCachePrefix = "";
}
/** {@inheritDoc} */
@Override
public void setConf(Configuration conf) {
mConf = conf;
}
/** {@inheritDoc} */
@Override
public Configuration getConf() {
return mConf;
}
/**
* An aggregator for use with Lists.foldLeft() that expands a list of paths that
* may include directories and globs into a list containing only files; each
* directory entry is replaced by the files that directory contains.
*/
@ApiAudience.Private
private final class DirExpandAggregator extends Lists.Aggregator<Path, List<Path>> {
// Note: This class could be factored out to assist other places in Kiji where we need
// to expand a list of files, dirs and globs into just a list of files, if necessary.
/** Last exception encountered during file stat lookups for the input paths. */
private IOException mLastExn;
/**
* For each input path, modify the 'outputs' list to include the path
* itself (if it is a file), or all the files in the directory (if it
* is a directory). Also expands globs with FileSystem.globStatus().
*
* @param inputPath the input path to expand.
* @param outputs list of output paths being accumulated.
* @return the 'outputs' list.
*/
@Override
public List<Path> eval(Path inputPath, List<Path> outputs) {
try {
FileSystem fs = inputPath.getFileSystem(getConf());
FileStatus[] matches = fs.globStatus(inputPath);
if (null == matches) {
mLastExn = new IOException("No such input path: " + inputPath);
} else if (matches.length == 0) {
mLastExn = new IOException("Input pattern \"" + inputPath + "\" matches 0 files.");
} else {
for (FileStatus match : matches) {
if (match.isDir()) {
// Match all the files in this dir, except the "bonus" files generated by
// MapReduce.
for (FileStatus subFile : fs.listStatus(match.getPath(),
new org.apache.hadoop.mapred.Utils.OutputFileUtils.OutputFilesFilter())) {
outputs.add(subFile.getPath());
LOG.debug("Added file: " + subFile.getPath());
}
} else {
// Just a file; add directly.
outputs.add(match.getPath());
LOG.debug("Added file: " + match.getPath());
}
}
}
} catch (IOException ioe) {
mLastExn = ioe;
}
return outputs;
}
/**
* Returns the last exception encountered during filesystem operations while
* expanding directories.
*
* @return the last exception encountered, or null if none was encountered.
*/
private IOException getLastException() {
return mLastExn;
}
}
/**
* Returns the raw input path(s) specified for this store. These may
* include wildcards or directories. You should use getExpandedInputPaths()
* to determine the set of files to actually read.
*
* @return an unmodifiable copy of the raw input paths.
*/
public List<Path> getInputPaths() {
return Collections.unmodifiableList(new ArrayList<Path>(mInputPaths));
}
/**
* Returns the set of input path(s) that should be actually opened for read.
* This set of paths may be on local disk (e.g., if the DistributedCache was used
* to transmit the files), or in HDFS. This will not contain directory names nor
* globs; it is expanded to the literal set of files to open.
*
* <p>Each Path object returned is fully qualified, and represents an absolute
* path that should be opened by its associated FileSystem object.</p>
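*
* <p>For illustration, a sketch of a read loop over the expanded paths,
* assuming a FileStoreHelper instance named {@code helper} (error handling
* omitted):</p>
*
* <pre>{@code
*   for (Path path : helper.getExpandedInputPaths()) {
*     FileSystem fs = path.getFileSystem(helper.getConf());
*     FSDataInputStream in = fs.open(path); // A local or HDFS file.
*     // ... read the file's contents, then close the stream.
*     in.close();
*   }
* }</pre>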
*
* @return an unmodifiable list of the fully expanded input paths.
* @throws IOException if there is an error communicating with the underlying
* FileSystem while expanding paths and globs.
*/
public List<Path> getExpandedInputPaths() throws IOException {
// If we've read a bunch of files from the DistributedCache's local dir,
// no further unglobbing is necessary. Just return the values.
if (!mDCachePrefix.isEmpty()) {
return Collections.unmodifiableList(mInputPaths);
}
// Otherwise, these are "raw" user inputs. Unglob and expand them.
DirExpandAggregator expander = new DirExpandAggregator();
List<Path> actualInputPaths = Lists.distinct(Lists.foldLeft(
new ArrayList<Path>(), mInputPaths, expander));
IOException savedException = expander.getLastException();
if (null != savedException) {
// Rethrow the saved exception from this context.
throw savedException;
}
return Collections.unmodifiableList(actualInputPaths);
}
/**
* If the cache URI prefix is already set, returns that value. Otherwise creates
* a new unique cache URI prefix. This method does not memoize its return value;
* while mDCachePrefix is empty, successive calls will return distinct values.
*
* @return the DistributedCache URI prefix for files used by this store.
*/
private String getCachePrefix() {
if (mDCachePrefix.isEmpty()) {
// We need to put the files for this KVStore into the distributed cache. They
// should be given symlink names that do not conflict with the names associated
// with other KeyValueStores. Pick a symlink prefix that is unique to this store.
long prefixId = System.currentTimeMillis() ^ (((long) this.hashCode()) << 8);
StringBuilder sb = new StringBuilder();
Formatter formatter = new Formatter(sb, Locale.US);
formatter.format("%08x", prefixId);
String newPrefix = sb.toString();
LOG.debug("This KeyValueStore uses Distributed cache files in namespace: " + newPrefix);
return newPrefix;
} else {
return mDCachePrefix; // Prefix is already set.
}
}
/**
* Serializes file- and DistributedCache-specific properties associated
* with the KeyValueStore that owns this FileStoreHelper to the specified configuration.
*
* @param conf the configuration to populate.
* @throws IOException if there's an error serializing the state.
*/
public void storeToConf(KeyValueStoreConfiguration conf) throws IOException {
if (mInputPaths.isEmpty()) {
throw new IOException("Required attribute not set: input path");
}
if (mUseDCache && !"local".equals(conf.get("mapreduce.jobtracker.address", ""))) {
// If we're scheduled to use the distributed cache, and we're not in the LocalJobRunner,
// add these files to the DistributedCache.
// TODO(aaron): This does not handle any sort of MapperTester, etc.
// We need a separate flag that tells this to ignore mUseDCache if we're in a test
// environment, and just use the original input file specs.
final String dCachePrefix = getCachePrefix();
// Associate this randomly chosen prefix id with this KVStore implementation.
conf.set(CONF_DCACHE_PREFIX_KEY, dCachePrefix);
// Add the input paths to the DistributedCache and translate path names.
int uniqueId = 0;
// TODO: getExpandedInputPaths() should use the Configuration from conf, not our getConf().
for (Path inputPath : getExpandedInputPaths()) {
FileSystem fs = inputPath.getFileSystem(conf.getDelegate());
Path absolutePath = inputPath.makeQualified(fs);
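// The URI fragment (after '#') names the symlink that the task will see in its
// working directory; the prefix plus a counter keeps names unique per store.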
String uriStr = absolutePath.toString() + "#" + dCachePrefix + "-" + uniqueId;
LOG.debug("Adding to DistributedCache: " + uriStr);
uniqueId++;
try {
DistributedCache.addCacheFile(new URI(uriStr), conf.getDelegate());
} catch (URISyntaxException use) {
throw new IOException("Could not construct URI for file: " + uriStr, use);
}
}
// Ensure that symlinks are created for cached files.
DistributedCache.createSymlink(conf.getDelegate());
// Now save the cache prefix into the local state. We couldn't set this earlier,
// because we wanted getExpandedInputPaths() to actually unglob things. That
// function will behave differently if mDCachePrefix is already initialized.
mDCachePrefix = dCachePrefix;
} else {
// Just put the regular HDFS paths in the Configuration.
conf.setStrings(CONF_PATHS_KEY,
Lists.toArray(Lists.map(mInputPaths, new Lists.ToStringFn<Path>()), String.class));
}
}
/**
* Deserializes file- and DistributedCache-specific properties associated
* with the KeyValueStore that owns this FileStoreHelper from the specified configuration.
*
* <p>This retains a reference to the KeyValueStoreConfiguration's backing Configuration
* instance to use when opening files specified by this configuration.</p>
*
* @param conf the configuration to read.
* @throws IOException if there's an error deserializing the configuration.
*/
public void initFromConf(KeyValueStoreConfiguration conf) throws IOException {
setConf(conf.getDelegate());
mDCachePrefix = conf.get(CONF_DCACHE_PREFIX_KEY, "");
LOG.debug("Input dCachePrefix: " + mDCachePrefix);
if (mDCachePrefix.isEmpty()) {
// Read an ordinary list of files from the Configuration.
// These may include directories and globs to expand.
mInputPaths = Lists.map(Arrays.asList(conf.getStrings(
CONF_PATHS_KEY, new String[0])),
new Lists.Func<String, Path>() {
@Override
public Path eval(String in) {
LOG.debug("File input: " + in);
return new Path(in);
}
});
} else {
// Use the dcache prefix to get the names of the files for this store.
// The symlinks are already present in the working dir of the task.
final FileSystem localFs = FileSystem.getLocal(conf.getDelegate());
FileStatus[] statuses = localFs.globStatus(new Path(mDCachePrefix + "-*"));
if (null == statuses || statuses.length == 0) {
throw new IOException("No files associated with the job in the DistributedCache");
}
// Get the (absolute) input file paths to use.
mInputPaths = Lists.map(Arrays.asList(statuses), new Lists.Func<FileStatus, Path>() {
@Override
public Path eval(FileStatus status) {
Path out = status.getPath().makeQualified(localFs);
LOG.debug("Loaded from DistributedCache: " + out);
return out;
}
});
}
// If we are initializing a client-side instance to later serialize, the user may have
// specified HDFS files, but also an intent to put the files in the DistributedCache. Set
// this flag now, which will generate mDCachePrefix when storeToConf() is called later.
mUseDCache = conf.getBoolean(CONF_USE_DCACHE_KEY, USE_DCACHE_DEFAULT);
}
}