/**
 * (c) Copyright 2012 WibiData, Inc.
 *
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.kiji.mapreduce.kvstore.lib;

import java.io.IOException;
import java.util.List;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.hadoop.io.AvroKeyValue;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

import org.kiji.annotations.ApiAudience;
import org.kiji.annotations.ApiStability;
import org.kiji.mapreduce.kvstore.KeyValueStore;
import org.kiji.mapreduce.kvstore.KeyValueStoreReader;
import org.kiji.mapreduce.kvstore.framework.KeyValueStoreConfiguration;

/**
 * A KeyValueStore providing read access to Avro container files of (key, value)
 * records.
 *
 * <p>This KeyValueStore provides lookup access to an Avro container file by reading
 * the entire file into memory. The Avro file is assumed to contain records with (at
 * least) two fields, named "key" and "value." This store will decompose the top-level
 * record into its two fields, and index the "value" field by the key.</p>
 *
 * <h3>XML Configuration</h3>
 *
 * <p>A kvstores XML file may contain the following properties when specifying the
 * behavior of this class:</p>
 * <ul>
 *   <li><tt>dcache</tt> - True if files should be accessed by jobs through the
 *       DistributedCache.</li>
 *   <li><tt>paths</tt> - A comma-separated list of HDFS paths to files backing this
 *       store.</li>
 *   <li><tt>avro.reader.schema</tt> - The reader schema to apply to records in the
 *       input file(s).</li>
 * </ul>
 *
 * <h3>Default values</h3>
 *
 * <ul>
 *   <li>By default, use of the DistributedCache is enabled.</li>
 *   <li>You must specify the paths to read. It is an error to leave this unconfigured.</li>
 *   <li>Files will be read using a new <tt>Configuration</tt> object if you do not specify
 *       your own.</li>
 *   <li>If you do not specify an Avro reader schema, the writer schema from each
 *       file will be used. (In this case, it is expected that these files have the same
 *       writer schema.) The schema of each datum in your Avro files must be a record that
 *       contains at least two fields, called <tt>key</tt> and <tt>value</tt>.
 *       The schema of these two fields is up to you.
 *       Any additional fields in a top-level record will be ignored. It is an error for
 *       your schema to not include these fields; see the {@link AvroRecordKeyValueStore}
 *       for a more free-form record-based Avro KeyValueStore.</li>
 * </ul>
 *
 * @param <K> The type of the key field.
 * @param <V> The type of the value field.
 */
@ApiAudience.Public
@ApiStability.Evolving
public final class AvroKVRecordKeyValueStore<K, V>
    implements Configurable, KeyValueStore<K, V> {

  /**
   * The name of the record field used as the lookup key. This is shared by the
   * builder path and the initFromConf() path so both always agree.
   */
  private static final String KEY_FIELD = AvroKeyValue.KEY_FIELD;

  /** A wrapped store for looking up an Avro record by its 'key' field. */
  private final AvroRecordKeyValueStore<K, GenericRecord> mStore;

  /**
   * true once the user has called open(); setConf() and initFromConf() are
   * rejected after that.
   */
  private boolean mOpened;

  /**
   * A Builder-pattern class that configures and creates new AvroKVRecordKeyValueStore
   * instances. You should use this to specify the input to this KeyValueStore.
   * Call the build() method to return a new, configured AvroKVRecordKeyValueStore instance.
   *
   * <p>Every setting is forwarded to a wrapped AvroRecordKeyValueStore.Builder.</p>
   */
  @ApiAudience.Public
  @ApiStability.Evolving
  public static final class Builder {
    /** The wrapped builder that receives every setting made on this builder. */
    private final AvroRecordKeyValueStore.Builder mAvroRecordStoreBuilder;

    /**
     * Private, default constructor. Call the builder() method of this KeyValueStore
     * to get a new builder instance.
     */
    private Builder() {
      mAvroRecordStoreBuilder = AvroRecordKeyValueStore.builder();
      // The wrapped store is always keyed on the "key" field of each record.
      mAvroRecordStoreBuilder.withKeyFieldName(KEY_FIELD);
    }

    /**
     * Sets the schema to read the records with.
     * This may be null; the schema used when writing the input files will be used directly.
     *
     * @param schema The reader schema.
     * @return This builder instance.
     */
    public Builder withReaderSchema(Schema schema) {
      mAvroRecordStoreBuilder.withReaderSchema(schema);
      return this;
    }

    /**
     * Sets the Hadoop configuration instance to use.
     *
     * @param conf The configuration.
     * @return This builder instance.
     */
    public Builder withConfiguration(Configuration conf) {
      mAvroRecordStoreBuilder.withConfiguration(conf);
      return this;
    }

    /**
     * Adds a path to the list of files to load.
     *
     * @param path The input file/directory path.
     * @return This builder instance.
     */
    public Builder withInputPath(Path path) {
      mAvroRecordStoreBuilder.withInputPath(path);
      return this;
    }

    /**
     * Replaces the current list of files to load with the set of files
     * specified as an argument.
     *
     * @param paths The input file/directory paths.
     * @return This builder instance.
     */
    public Builder withInputPaths(List<Path> paths) {
      mAvroRecordStoreBuilder.withInputPaths(paths);
      return this;
    }

    /**
     * Sets a flag indicating the use of the DistributedCache to distribute
     * input files.
     *
     * @param enabled true if the DistributedCache should be used, false otherwise.
     * @return This builder instance.
     */
    public Builder withDistributedCache(boolean enabled) {
      mAvroRecordStoreBuilder.withDistributedCache(enabled);
      return this;
    }

    /**
     * Build a new AvroKVRecordKeyValueStore instance.
     *
     * @param <K> the key type used to look up each record.
     * @param <V> the value type returned by each record.
     * @return the initialized KeyValueStore.
     */
    public <K, V> AvroKVRecordKeyValueStore<K, V> build() {
      return new AvroKVRecordKeyValueStore<K, V>(this);
    }
  }

  /**
   * Creates a new AvroKVRecordKeyValueStore.Builder instance that can be used
   * to configure and create a new KeyValueStore.
   *
   * @return a new Builder instance.
   */
  public static Builder builder() {
    return new Builder();
  }

  /**
   * Constructs an AvroKVRecordKeyValueStore from a builder.
   *
   * @param builder the builder instance to configure from.
   */
  private AvroKVRecordKeyValueStore(Builder builder) {
    mStore = builder.mAvroRecordStoreBuilder.build();
  }

  /**
   * Reflection-only constructor. Used only for reflection. You should create and configure
   * AvroKVRecordKeyValueStore instances by using a builder;
   * call AvroKVRecordKeyValueStore.builder() to get a new builder instance.
   */
  public AvroKVRecordKeyValueStore() {
    this(builder());
  }

  /**
   * {@inheritDoc}
   *
   * @throws IllegalStateException if open() has already been called on this store.
   */
  @Override
  public void setConf(Configuration conf) {
    if (mOpened) {
      // Don't allow mutation after we start using this store for reads.
      throw new IllegalStateException(
          "Cannot set the configuration after a reader has been opened");
    }
    mStore.setConf(conf);
  }

  /** {@inheritDoc} */
  @Override
  public Configuration getConf() {
    return mStore.getConf(); // This creates a new Configuration for return.
  }

  /** {@inheritDoc} */
  @Override
  public void storeToConf(KeyValueStoreConfiguration conf) throws IOException {
    mStore.storeToConf(conf);
  }

  /**
   * {@inheritDoc}
   *
   * <p>Note that this writes the key field name into <code>conf</code> before handing
   * it to the wrapped store.</p>
   *
   * @throws IllegalStateException if open() has already been called on this store.
   */
  @Override
  public void initFromConf(KeyValueStoreConfiguration conf) throws IOException {
    if (mOpened) {
      throw new IllegalStateException("Cannot reinitialize; already opened a reader.");
    }

    // By convention, we always use "key" as the field to access in an AvroKVRecord-based store.
    // When initializing the underlying AvroRecordKeyValueStore from our kvstores.xml,
    // ensure that this field is set as such.
    conf.set(AvroRecordKeyValueStore.CONF_KEY_FIELD_KEY, KEY_FIELD);
    mStore.initFromConf(conf);
  }

  /** {@inheritDoc} */
  @Override
  public KeyValueStoreReader<K, V> open() throws IOException {
    // The flag is raised before the reader is constructed, so the store counts as
    // opened even if constructing the reader throws.
    mOpened = true;
    return new Reader<K, V>(mStore);
  }

  /**
   * Reads an entire Avro container file of (key, value) records into memory, indexed
   * by "key."
   *
   * <p>Lookups for a key <i>K</i> will return the "value" field of the first record
   * in the file where the key field has value <i>K</i>.</p>
   *
   * @param <K> The type of the key field.
   * @param <V> The type of the value field.
   */
  @ApiAudience.Private
  static final class Reader<K, V> implements KeyValueStoreReader<K, V> {
    /** A wrapped Avro store reader for looking up a record by its 'key' field. */
    private final KeyValueStoreReader<K, GenericRecord> mReader;

    /**
     * Constructs a key value reader over an Avro file.
     *
     * @param store An Avro file store that uses the 'key' field as the key, and
     *     the entire record as the value.
     * @throws IOException If there is an error.
     */
    public Reader(AvroRecordKeyValueStore<K, GenericRecord> store) throws IOException {
      mReader = store.open();
    }

    /** {@inheritDoc} */
    @Override
    public boolean isOpen() {
      return mReader.isOpen();
    }

    /**
     * {@inheritDoc}
     *
     * @return the "value" field of the record matching <code>key</code>, or null if the
     *     wrapped reader has no record for that key.
     */
    @Override
    public V get(K key) throws IOException {
      final GenericRecord record = mReader.get(key);
      if (null == record) {
        // No match.
        return null;
      }

      // GenericRecord.get() returns Object; V is chosen by the caller and cannot be
      // verified here, so the unchecked warning is suppressed for this cast only.
      @SuppressWarnings("unchecked")
      final V value = (V) record.get(AvroKeyValue.VALUE_FIELD);
      return value;
    }

    /** {@inheritDoc} */
    @Override
    public boolean containsKey(K key) throws IOException {
      return mReader.containsKey(key);
    }

    /** {@inheritDoc} */
    @Override
    public void close() throws IOException {
      mReader.close();
    }
  }
}