/**
 * (c) Copyright 2012 WibiData, Inc.
 *
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.kiji.mapreduce.bulkimport;

import java.io.IOException;
import java.util.Collections;
import java.util.Map;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;

import org.kiji.annotations.ApiAudience;
import org.kiji.annotations.ApiStability;
import org.kiji.annotations.Inheritance;
import org.kiji.mapreduce.KijiTableContext;
import org.kiji.mapreduce.kvstore.KeyValueStore;
import org.kiji.mapreduce.kvstore.KeyValueStoreClient;

/**
 * <p>Base class for all Kiji bulk importers.  Subclasses of KijiBulkImporter can be
 * passed to the --importer flag of a <code>kiji bulk-import</code> command.</p>
 *
 * <p>To implement your own bulk importer, extend KijiBulkImporter and implement the
 * {@link #produce(Object, Object, KijiTableContext)} method to process
 * your input.  To write data to Kiji, call the appropriate <code>put()</code> method of the
 * {@link KijiTableContext}.</p>
 *
 * <h1>Lifecycle:</h1>
 *
 * <p>Internal state is set by a call to setConf().  Thus, KijiBulkImporters will be
 * automatically initialized by hadoop's ReflectionUtils.</p>
 *
 * <p>
 *   As a {@link KeyValueStoreClient}, KijiBulkImporter will have access to all
 *   stores defined by {@link KeyValueStoreClient#getRequiredStores()}.  Readers for
 *   these stores are surfaced in the setup(), produce(), and cleanup() methods
 *   via the Context provided to each by calling
 *   {@link org.kiji.mapreduce.KijiContext#getStore(String)}.
 * </p>
 *
 * <p>Once the internal state is set, functions may be called in any order, except for
 * restrictions on setup(), produce(), and cleanup().</p>
 *
 * <p>setup() will get called once at the beginning of the map phase, followed by
 * a call to produce() for each input key-value pair.  Once all of these produce()
 * calls have completed, cleanup() will be called exactly once.  It is possible
 * that this setup-produce-cleanup cycle may repeat any number of times.</p>
 *
 * <h1>Skeleton:</h1>
 * <p>
 *   Any concrete implementation of a KijiBulkImporter must implement the {@link #produce} method.
 *   An example of a bulk importer that parses colon-delimited mappings of strings to integers:
 * </p>
 * <pre><code>
 *   public void produce(LongWritable filePos, Text value, KijiTableContext context)
 *       throws IOException {
 *     final String[] split = value.toString().split(":");
 *     final String rowKey = split[0];
 *     final int integerValue = Integer.parseInt(split[1]);
 *     final EntityId eid = context.getEntityId(rowKey);
 *     context.put(eid, "primitives", "int", integerValue);
 *   }
 * </code></pre>
 *
 * @param <K> The type of the MapReduce input key, which will depend on the input format used.
 * @param <V> The type of the MapReduce input value, which will depend on the input format used.
 */
@ApiAudience.Public
@ApiStability.Stable
@Inheritance.Extensible
public abstract class KijiBulkImporter<K, V>
    implements Configurable, KeyValueStoreClient {

  /** The Configuration of this bulk importer. */
  private Configuration mConf;

  /**
   * Your subclass of KijiBulkImporter must have a default constructor if it is to be used
   * in a bulk import job.  The constructors should be lightweight, since the framework is
   * free to create KijiBulkImporters at any time.
   *
   * <p>The importer starts out with a freshly created {@link Configuration}, which is
   * replaced by whatever is later passed to {@link #setConf(Configuration)}.</p>
   */
  public KijiBulkImporter() {
    mConf = new Configuration();
  }

  /**
   * {@inheritDoc}
   * <p>If you override this method for your bulk importer, you must call
   * super.setConf(conf); or the configuration will not be saved properly.</p>
   */
  @Override
  public void setConf(Configuration conf) {
    mConf = conf;
  }

  /**
   * {@inheritDoc}
   * <p>Overriding this method without returning super.getConf() may cause undesired behavior.</p>
   */
  @Override
  public Configuration getConf() {
    return mConf;
  }

  /**
   * {@inheritDoc}
   * <p>This base implementation declares no stores and returns an empty map.</p>
   */
  @Override
  public Map<String, KeyValueStore<?, ?>> getRequiredStores() {
    return Collections.emptyMap();
  }

  /**
   * Called once to initialize this bulk importer before any calls to
   * {@link #produce(Object, Object, KijiTableContext)}.
   *
   * @param context A context you can use to generate EntityIds and commit writes.
   *     See {@link KijiTableContext#getEntityId(Object...)}.
   * @throws IOException on I/O error.
   */
  public void setup(KijiTableContext context) throws IOException {
    // By default, do nothing. Nothing may be added here, because subclasses may implement
    // setup methods without calling super.setup().
  }

  /**
   * Produces data to be imported into Kiji.
   *
   * <p>Produce is called once for each key-value pair record from the raw input data.  To import
   * this data to Kiji, use the context.put(...) methods inherited from
   * {@link org.kiji.schema.KijiPutter} to specify a cell address and value to write.  One
   * execution of produce may include multiple calls to put, which may span multiple rows, columns
   * and locality groups if desired. The context provides a
   * {@link KijiTableContext#getEntityId(Object...)} method for generating EntityIds from input
   * data.</p>
   *
   * @param key The MapReduce input key (its type depends on the InputFormat you use).
   * @param value The MapReduce input value (its type depends on the InputFormat you use).
   * @param context A context you can use to generate EntityIds and commit writes.
   *     See {@link KijiTableContext#getEntityId(Object...)} and
   *     {@link org.kiji.schema.KijiPutter#put(org.kiji.schema.EntityId, String, String, Object)}.
   * @throws IOException on I/O error.
   */
  public abstract void produce(K key, V value, KijiTableContext context)
      throws IOException;

  /**
   * Called once to clean up this bulk importer after all
   * {@link #produce(Object, Object, KijiTableContext)} calls are made.
   *
   * @param context A context you can use to generate EntityIds and commit writes.
   *     See {@link KijiTableContext#getEntityId(Object...)}.
   * @throws IOException on I/O error.
   */
  public void cleanup(KijiTableContext context) throws IOException {
    // By default, do nothing. Nothing may be added here, because subclasses may implement
    // cleanup methods without calling super.cleanup().
  }
}