/**
* (c) Copyright 2012 WibiData, Inc.
*
* See the NOTICE file distributed with this work for additional
* information regarding copyright ownership.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.kiji.mapreduce.gather;
import java.io.IOException;
import java.util.Collections;
import java.util.Map;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.kiji.annotations.ApiAudience;
import org.kiji.annotations.ApiStability;
import org.kiji.annotations.Inheritance;
import org.kiji.mapreduce.KVOutputJob;
import org.kiji.mapreduce.kvstore.KeyValueStore;
import org.kiji.mapreduce.kvstore.KeyValueStoreClient;
import org.kiji.schema.KijiDataRequest;
import org.kiji.schema.KijiRowData;
/**
* <p>
* Base class for all Kiji Gatherers. Gatherers are jobs that scan a
* kiji table and aggregate data into an external file.
* </p>
*
* <h1>Lifecycle:</h1>
*
* <p>
* Instances are created using ReflectionUtils, so the {@link
* org.apache.hadoop.conf.Configuration} is automagically set immediately after
* instantiation with a call to setConf(). In order to initialize internal state
* before any other methods are called, override the setConf() method.
* </p>
*
* <p>
* As a {@link KeyValueStoreClient}, KijiGatherers will have access to all
* stores defined by {@link KeyValueStoreClient#getRequiredStores()}. Readers for
* these stores are surfaced in the setup(), gather(), and cleanup() methods
* via the Context provided to each by calling
* {@link org.kiji.mapreduce.KijiContext#getStore(String)}.
* </p>
*
* <p>
* Once the internal state is set, functions may be called in any order, except for
* restrictions on setup(), gather(), and cleanup().
* </p>
*
* <p>
* setup() will get called once at the beginning of the map phase, followed by
* a call to gather() for each input row. Once all of these gather()
* calls have completed, cleanup() will be called exactly once. It is possible
* that this setup-gather-cleanup cycle may repeat any number of times.
* </p>
*
* <h1>Skeleton:</h1>
* <p>
* Any concrete implementation of a KijiGatherer must implement the {@link #getDataRequest()},
* {@link #gather}, {@link #getOutputKeyClass()}, and {@link #getOutputValueClass()} methods.
* An example of a gather method that counts the domains from the email field of each row:
* </p>
* <pre><code>
* public void gather(KijiRowData input, GathererContext context)
* throws IOException {
* if (!input.containsColumn("info", "email")) {
* return;
* }
* String email = input.getMostRecentValue("info", "email").toString();
* int atSymbol = email.indexOf("@");
* String domain = email.substring(atSymbol + 1);
* mDomain.set(domain);
* context.write(mDomain, ONE);
* }
* </code></pre>
* For the entire code for this gatherer, check out EmailDomainCountGatherer in KijiMR Lib.
*
* @param <K> The type of the output key from the gatherer.
* @param <V> The type of the output value from the gatherer.
*/
@ApiAudience.Public
@ApiStability.Stable
@Inheritance.Extensible
public abstract class KijiGatherer<K, V>
    implements Configurable, KeyValueStoreClient, KVOutputJob {

  /** Configuration most recently supplied through {@link #setConf(Configuration)}. */
  private Configuration mConf;

  /**
   * Records the Configuration this gatherer should use.
   *
   * <p>This method is invoked immediately after the instance is created, which makes it
   * the place to initialize internal state that depends on the configuration.</p>
   *
   * <p>Subclasses overriding this method must call super.setConf(conf); otherwise the
   * configuration is not stored and {@link #getConf()} will not return it.</p>
   *
   * @param conf The Configuration to use.
   */
  @Override
  public void setConf(Configuration conf) {
    this.mConf = conf;
  }

  /**
   * {@inheritDoc}
   * <p>An override that does not return super.getConf() may cause undesired behavior.</p>
   */
  @Override
  public Configuration getConf() {
    return this.mConf;
  }

  /**
   * Describes the input columns that must be made available to this gatherer.
   * This method may be called several times, possibly before {@link
   * #setup(org.kiji.mapreduce.gather.GathererContext)} has run.
   *
   * @return a kiji data request.
   */
  public abstract KijiDataRequest getDataRequest();

  /**
   * One-time initialization hook, called before any call to gather().
   * Override it to perform setup work for the gather job.
   *
   * @param context A gatherer context used to write key/value output, access stores, etc.
   * @throws IOException if there is an error.
   */
  public void setup(GathererContext<K, V> context) throws IOException {
    // Intentionally empty. This body must stay empty: subclasses are allowed to override
    // setup() without calling super.setup().
  }

  /**
   * Processes one row; called once for every row of the kiji table.
   *
   * @param input A single row of input from the Kiji table, filled with the data requested.
   * @param context A gatherer context, used to write key/value output.
   * @throws IOException If there is an error.
   */
  public abstract void gather(KijiRowData input, GathererContext<K, V> context)
      throws IOException;

  /**
   * One-time teardown hook, called after all calls to gather() to dispose of any
   * resources used by the gatherer.
   * Override it to perform final cleanup work for the gather job.
   *
   * @param context A gatherer context used to write key/value output, access stores, etc.
   * @throws IOException if there is an error.
   */
  public void cleanup(GathererContext<K, V> context) throws IOException {
    // Intentionally empty. This body must stay empty: subclasses are allowed to override
    // cleanup() without calling super.cleanup().
  }

  /** {@inheritDoc} */
  @Override
  public Map<String, KeyValueStore<?, ?>> getRequiredStores() {
    // The base gatherer requires no key/value stores.
    return Collections.<String, KeyValueStore<?, ?>>emptyMap();
  }
}