/**
* (c) Copyright 2012 WibiData, Inc.
*
* See the NOTICE file distributed with this work for additional
* information regarding copyright ownership.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.kiji.mapreduce.kvstore.lib;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.util.ReflectionUtils;
import org.kiji.annotations.ApiAudience;
import org.kiji.annotations.ApiStability;
import org.kiji.mapreduce.kvstore.KeyValueStore;
import org.kiji.mapreduce.kvstore.KeyValueStoreReader;
import org.kiji.mapreduce.kvstore.framework.KeyValueStoreConfiguration;
/**
* KeyValueStore implementation that reads records from SequenceFiles.
*
* <h3>XML Configuration</h3>
*
* <p>When specifying a SeqFileKeyValueStore in a kvstores XML file, you may
* specify the following properties:</p>
* <ul>
* <li><tt>dcache</tt> - True if files should be accessed by jobs through the
* DistributedCache.</li>
* <li><tt>paths</tt> - A comma-separated list of HDFS paths to files backing this store.</li>
* </ul>
*
* <h3>Default values</h3>
*
* <ul>
* <li>By default, use of the DistributedCache is enabled.</li>
* <li>You must specify the paths to read. It is an error to leave this unconfigured.</li>
* <li>Files will be read using a new <tt>Configuration</tt> object if you do not specify
* your own.</li>
* </ul>
*
* @param <K> The type of the key field stored in the SequenceFile(s).
* @param <V> The type of value field stored in the SequenceFile(s).
*/
@ApiAudience.Public
@ApiStability.Evolving
public final class SeqFileKeyValueStore<K, V> implements Configurable, KeyValueStore<K, V> {
/**
 * Helper object to manage backing files. Configuration access, (de)serialization of
 * settings, and input path lookups in this class all delegate to it.
 */
private final FileStoreHelper mFileHelper;
/**
 * Set to true by open(). While true, setConf() and initFromConf() throw
 * IllegalStateException, so the store cannot be reconfigured.
 */
private boolean mOpened;
/**
 * Fluent builder for {@link SeqFileKeyValueStore} instances.
 *
 * <p>Obtain one from {@link SeqFileKeyValueStore#builder()}, describe the input through
 * the <tt>with...</tt> methods, then call {@link #build()} to create the store.
 * Every setting is forwarded to an underlying <tt>FileStoreHelper.Builder</tt>.</p>
 */
@ApiAudience.Public
@ApiStability.Evolving
public static final class Builder {
  /** Accumulates the file-related settings the store's FileStoreHelper is built from. */
  private FileStoreHelper.Builder mFileBuilder = FileStoreHelper.builder();

  /**
   * Private, default constructor. Use SeqFileKeyValueStore.builder() to obtain
   * a new builder instance.
   */
  private Builder() {
  }

  /**
   * Chooses the Hadoop configuration instance the store should use.
   *
   * @param conf The configuration.
   * @return This builder instance, to allow call chaining.
   */
  public Builder withConfiguration(Configuration conf) {
    mFileBuilder.withConfiguration(conf);
    return this;
  }

  /**
   * Appends one path to the list of files to load.
   *
   * @param path The input file/directory path.
   * @return This builder instance, to allow call chaining.
   */
  public Builder withInputPath(Path path) {
    mFileBuilder.withInputPath(path);
    return this;
  }

  /**
   * Discards the current list of files to load and uses the given
   * paths in its place.
   *
   * @param paths The input file/directory paths.
   * @return This builder instance, to allow call chaining.
   */
  public Builder withInputPaths(List<Path> paths) {
    mFileBuilder.withInputPaths(paths);
    return this;
  }

  /**
   * Turns distribution of the input files through the DistributedCache
   * on or off.
   *
   * @param enabled true if the DistributedCache should be used, false otherwise.
   * @return This builder instance, to allow call chaining.
   */
  public Builder withDistributedCache(boolean enabled) {
    mFileBuilder.withDistributedCache(enabled);
    return this;
  }

  /**
   * Creates a SeqFileKeyValueStore from the settings collected so far.
   *
   * @param <K> The type of the key field stored in the SequenceFile(s).
   * @param <V> The type of value field stored in the SequenceFile(s).
   * @return the initialized KeyValueStore.
   */
  public <K, V> SeqFileKeyValueStore<K, V> build() {
    final SeqFileKeyValueStore<K, V> store = new SeqFileKeyValueStore<K, V>(this);
    return store;
  }
}
/**
 * Entry point for configuring a SeqFileKeyValueStore: hands out a
 * SeqFileKeyValueStore.Builder on which the input can be described before
 * the store is created.
 *
 * @return a new Builder instance.
 */
public static Builder builder() {
  final Builder freshBuilder = new Builder();
  return freshBuilder;
}
/**
 * No-argument constructor that exists only so the class can be instantiated
 * reflectively. It configures the store from a newly created, untouched builder.
 * Application code should call SeqFileKeyValueStore.builder() and configure
 * the store through the returned builder instead.
 */
public SeqFileKeyValueStore() {
  this(new Builder());
}
/**
* Main constructor used by the builder; create a new SeqFileKeyValueStore to read SequenceFiles.
*
* @param builder the builder to configure from.
*/
private SeqFileKeyValueStore(Builder builder) {
mFileHelper = builder.mFileBuilder.build();
}
/**
 * {@inheritDoc}
 *
 * <p>The configuration is handed to the file helper. Not permitted once this
 * store has been opened.</p>
 *
 * @throws IllegalStateException if this store has already been opened.
 */
@Override
public void setConf(Configuration conf) {
  if (!mOpened) {
    mFileHelper.setConf(conf);
    return;
  }
  // Once the store is in use for reads, its configuration is frozen.
  throw new IllegalStateException(
      "Cannot set the configuration after a reader has been opened");
}
/**
 * {@inheritDoc}
 *
 * <p>The returned object is a new Configuration constructed from the file
 * helper's configuration, not the helper's own instance.</p>
 */
@Override
public Configuration getConf() {
  final Configuration helperConf = mFileHelper.getConf();
  return new Configuration(helperConf);
}
/**
 * {@inheritDoc}
 *
 * <p>Delegates entirely to the file helper, which writes its settings into
 * <tt>conf</tt>.</p>
 */
@Override
public void storeToConf(KeyValueStoreConfiguration conf) throws IOException {
  this.mFileHelper.storeToConf(conf);
}
/**
 * {@inheritDoc}
 *
 * <p>Delegates to the file helper, which restores its settings from
 * <tt>conf</tt>. Not permitted once this store has been opened.</p>
 *
 * @throws IllegalStateException if this store has already been opened.
 */
@Override
public void initFromConf(KeyValueStoreConfiguration conf) throws IOException {
  if (!mOpened) {
    mFileHelper.initFromConf(conf);
    return;
  }
  throw new IllegalStateException("Cannot reinitialize; already opened a reader.");
}
/**
 * Reports the input paths exactly as the user supplied them, as held by the
 * file helper.
 *
 * @return the raw input paths specified as input by the user.
 */
public List<Path> getInputPaths() {
  // Exposed mainly so that tests can inspect the configured paths.
  final List<Path> rawPaths = mFileHelper.getInputPaths();
  return rawPaths;
}
/**
 * {@inheritDoc}
 *
 * <p>Builds a reader over the file helper's expanded input paths; the reader loads
 * the records of every path into memory while it is being constructed. Once a
 * reader has been created successfully, this store can no longer be reconfigured
 * through setConf() or initFromConf().</p>
 *
 * @throws IOException if the input paths cannot be expanded or a SequenceFile cannot
 *     be read. In that case the store is not marked as opened and may still be
 *     reconfigured.
 */
@Override
public KeyValueStoreReader<K, V> open() throws IOException {
  final KeyValueStoreReader<K, V> reader =
      new Reader(mFileHelper.getConf(), mFileHelper.getExpandedInputPaths());
  // Freeze the configuration only after the reader exists, so a failed open()
  // leaves the store in its previous, still-configurable state.
  mOpened = true;
  return reader;
}
/**
 * In-memory reader over the contents of one or more SequenceFiles.
 *
 * <p>Every record is loaded eagerly at construction time into a hash map indexed by
 * the key field. When a key occurs more than once, only the first occurrence
 * encountered is kept, so lookups for a key <i>K</i> return the first record read
 * whose key field has value <i>K</i>.</p>
 */
@ApiAudience.Private
private final class Reader implements KeyValueStoreReader<K, V> {
  /** Index from key field to value; set to null when this reader is closed. */
  private Map<K, V> mMap;

  /**
   * Creates a reader and loads all of the given SequenceFiles into memory.
   *
   * @param conf The Hadoop configuration.
   * @param paths The path to the sequencefile(s).
   * @throws IOException If the seqfile cannot be read.
   */
  public Reader(Configuration conf, List<Path> paths) throws IOException {
    mMap = new HashMap<K, V>();
    for (Path seqFilePath : paths) {
      loadFile(conf, seqFilePath);
    }
  }

  /**
   * Reads one SequenceFile from start to end and adds its records to the lookup map,
   * skipping keys that are already present.
   *
   * @param conf The Hadoop configuration.
   * @param seqFilePath The SequenceFile to load.
   * @throws IOException If the seqfile cannot be read.
   */
  @SuppressWarnings("unchecked")
  private void loadFile(Configuration conf, Path seqFilePath) throws IOException {
    final FileSystem fileSystem = seqFilePath.getFileSystem(conf);
    final SequenceFile.Reader records =
        new SequenceFile.Reader(fileSystem, seqFilePath, conf);
    try {
      final Class<? extends K> keyType = (Class<? extends K>) records.getKeyClass();
      final Class<? extends V> valueType = (Class<? extends V>) records.getValueClass();
      while (true) {
        // Populate fresh key and value instances for each record.
        K key = ReflectionUtils.newInstance(keyType, conf);
        V value = ReflectionUtils.newInstance(valueType, conf);
        // next() yields null once the file is exhausted.
        key = (K) records.next(key);
        if (null == key) {
          break;
        }
        value = (V) records.getCurrentValue(value);
        // First occurrence of a key wins; later duplicates are ignored.
        if (!mMap.containsKey(key)) {
          mMap.put(key, value);
        }
      }
    } finally {
      records.close();
    }
  }

  /**
   * Rejects use of this reader after close().
   *
   * @throws IOException If this reader has been closed.
   */
  private void ensureOpen() throws IOException {
    if (!isOpen()) {
      throw new IOException("Reader is closed");
    }
  }

  /** {@inheritDoc} */
  @Override
  public boolean isOpen() {
    return mMap != null;
  }

  /** {@inheritDoc} */
  @Override
  public V get(K key) throws IOException {
    ensureOpen();
    return mMap.get(key);
  }

  /** {@inheritDoc} */
  @Override
  public boolean containsKey(K key) throws IOException {
    ensureOpen();
    return mMap.containsKey(key);
  }

  /** {@inheritDoc} */
  @Override
  public void close() throws IOException {
    // Dropping the map both releases the loaded records and marks the reader closed.
    mMap = null;
  }
}
}