/**
* (c) Copyright 2012 WibiData, Inc.
*
* See the NOTICE file distributed with this work for additional
* information regarding copyright ownership.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.kiji.mapreduce.produce;
import java.io.IOException;
import java.util.Collections;
import java.util.Map;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.kiji.annotations.ApiAudience;
import org.kiji.annotations.ApiStability;
import org.kiji.annotations.Inheritance;
import org.kiji.mapreduce.KijiContext;
import org.kiji.mapreduce.kvstore.KeyValueStore;
import org.kiji.mapreduce.kvstore.KeyValueStoreClient;
import org.kiji.schema.KijiDataRequest;
import org.kiji.schema.KijiRowData;
/**
* <p>
* Base class for all Kiji Producers used to generate per-row derived
* entity data.
* </p>
*
* <h1>Lifecycle:</h1>
*
* <p>
* Instances are created using ReflectionUtils, so the {@link
* org.apache.hadoop.conf.Configuration} is automagically set immediately after
* instantiation with a call to setConf(). In order to initialize internal state
* before any other methods are called, override the setConf() method.
* </p>
*
* <p>
* As a {@link KeyValueStoreClient}, KijiProducers will have access to all
* stores defined by {@link KeyValueStoreClient#getRequiredStores()}. Readers for
* these stores are surfaced in the setup(), produce(), and cleanup() methods
* via the Context provided to each by calling {@link KijiContext#getStore(String)}.
* </p>
*
* <p>
* Once the internal state is set, functions may be called in any order, except for
* restrictions on setup(), produce(), and cleanup().
* </p>
*
* <p>
* setup() will get called once at the beginning of the map phase, followed by
* a call to produce() for each input row. Once all of these produce()
* calls have completed, cleanup() will be called exactly once. It is possible
* that this setup-produce-cleanup cycle may repeat any number of times.
* </p>
*
* <p>
* A final guarantee is that setup(), produce(), and cleanup() will be called after
* getDataRequest() and getOutputColumn() have each been called at least once.
* </p>
*
* <h1>Skeleton:</h1>
* <p>
* Any concrete implementation of a KijiProducer must implement the {@link #getDataRequest()},
* {@link #produce}, and {@link #getOutputColumn()} methods.
* An example of a produce method that extracts the domains from the email field of each row:
* </p>
* <pre><code>
* public void produce(KijiRowData input, ProducerContext context)
* throws IOException {
* if (!input.containsColumn("info", "email")) {
* return;
* }
* String email = input.getMostRecentValue("info", "email").toString();
* int atSymbol = email.indexOf("@");
*
* String domain = email.substring(atSymbol + 1);
* context.put(domain);
* }
* </code></pre>
* For the entire code for this producer, check out EmailDomainProducer in KijiMR Lib.
*/
@ApiAudience.Public
@ApiStability.Stable
@Inheritance.Extensible
public abstract class KijiProducer
implements Configurable, KeyValueStoreClient {
/** The Configuration of this producer. */
private Configuration mConf;
/**
* Your subclass of KijiProducer must have a default constructor if it is to be used in a
* KijiProduceJob. The constructors should be lightweight, since the framework is free to
* create KijiProducers at any time.
*/
public KijiProducer() {
}
/**
* Sets the Configuration for this KijiProducer to use.
* This function is guaranteed to be called immediately after instantiation.
* Override this method to initialize internal state from a configuration.
*
* <p>If you override this method for your producer, you must call super.setConf(); or the
* configuration will not be saved properly.</p>
*
* @param conf The Configuration to read.
*/
@Override
public void setConf(Configuration conf) {
mConf = conf;
}
/**
* {@inheritDoc}
* <p>Overriding this method without returning super.getConf() may cause undesired behavior.</p>
*/
@Override
public Configuration getConf() {
return mConf;
}
/**
* Returns a KijiDataRequest that describes which input columns need to be available to
* the producer. This method may be called multiple times, perhaps before {@link
* #setup(KijiContext)}.
*
* @return a kiji data request.
*/
public abstract KijiDataRequest getDataRequest();
/** {@inheritDoc} */
@Override
public Map<String, KeyValueStore<?, ?>> getRequiredStores() {
return Collections.emptyMap();
}
/**
* Return the name of the output column. An output column is of the form "family" or
* "family:qualifier". Family columns can store key/value pairs. A qualifier column
* may only contain a single piece of data.
*
* @return the output column name.
*/
public abstract String getOutputColumn();
/**
* Called once to initialize this producer before any calls to
* {@link #produce(KijiRowData, ProducerContext)}.
*
* @param context The KijiContext providing access to KeyValueStores, Counters, etc.
* @throws IOException on I/O error.
*/
public void setup(KijiContext context) throws IOException {
// By default, do nothing. Nothing may be added here, because subclasses may implement setup
// methods without super.setup().
}
/**
* Called to compute derived data for a single entity. The input that is included is controlled
* by the {@link org.kiji.schema.KijiDataRequest} returned in {@link #getDataRequest}.
*
* @param input The requested input data for the entity.
* @param context The producer context, used to output derived data.
* @throws IOException on I/O error.
*/
public abstract void produce(KijiRowData input, ProducerContext context) throws IOException;
/**
* Called once to clean up this producer after all
* {@link #produce(KijiRowData, ProducerContext)} calls are made.
*
* @param context The KijiContext providing access to KeyValueStores, Counters, etc.
* @throws IOException on I/O error.
*/
public void cleanup(KijiContext context) throws IOException {
// By default, do nothing. Nothing may be added here, because subclasses may implement setup
// methods without super.cleanup().
}
}