/**
 * (c) Copyright 2012 WibiData, Inc.
 *
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.kiji.mapreduce.gather;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import com.google.common.base.Preconditions;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.ReflectionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.kiji.annotations.ApiAudience;
import org.kiji.annotations.ApiStability;
import org.kiji.mapreduce.JobConfigurationException;
import org.kiji.mapreduce.KijiMapReduceJob;
import org.kiji.mapreduce.KijiMapper;
import org.kiji.mapreduce.KijiReducer;
import org.kiji.mapreduce.MapReduceJobOutput;
import org.kiji.mapreduce.framework.HFileKeyValue;
import org.kiji.mapreduce.framework.KijiConfKeys;
import org.kiji.mapreduce.framework.KijiTableInputJobBuilder;
import org.kiji.mapreduce.gather.impl.GatherMapper;
import org.kiji.mapreduce.kvstore.KeyValueStore;
import org.kiji.mapreduce.kvstore.KeyValueStoreClient;
import org.kiji.mapreduce.output.HFileMapReduceJobOutput;
import org.kiji.mapreduce.output.framework.HFileReducerMapReduceJobOutput;
import org.kiji.mapreduce.reducer.IdentityReducer;
import org.kiji.schema.KijiDataRequest;

/**
 * Builds jobs that run a gatherer over a Kiji table.
 *
 * <p>Example usage:</p>
 * <pre><code>
 *   final KijiMapReduceJob job = KijiGatherJobBuilder.create()
 *       .withInputTable(myTable)
 *       .withGatherer(MyCountGatherer.class)
 *       .withReducer(SimpleIntSumReducer.class)
 *       .withOutput(new TextMapReduceJobOutput("path/to/counts", numSplits))
 *       .build();
 *   final boolean success = job.run();
 * </code></pre>
 */
@SuppressWarnings("rawtypes")
@ApiAudience.Public
@ApiStability.Stable
public final class KijiGatherJobBuilder
    extends KijiTableInputJobBuilder<KijiGatherJobBuilder> {
  private static final Logger LOG = LoggerFactory.getLogger(KijiGatherJobBuilder.class);

  /** The class of the gatherer to run. */
  private Class<? extends KijiGatherer> mGathererClass;

  /** The class of the combiner to run. */
  private Class<? extends KijiReducer> mCombinerClass;

  /** The class of the reducer to run. */
  private Class<? extends KijiReducer> mReducerClass;

  /** The mapper that wraps the configured gatherer and runs it over the input table. */
  private GatherMapper mMapper;

  /** The gatherer instance. */
  private KijiGatherer<?, ?> mGatherer;

  /** The combiner instance (may be null if no combiner is specified). */
  private KijiReducer<?, ?, ?, ?> mCombiner;

  /** The reducer instance (may be null if no reducer is specified). */
  private KijiReducer<?, ?, ?, ?> mReducer;

  /** The data request for the job's table input. */
  private KijiDataRequest mDataRequest;

  /**
   * Constructs a builder for jobs that run a Kiji gatherer over a Kiji table.
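   *
   * <p>The constructor is private; use {@link KijiGatherJobBuilder#create()} to obtain
   * a new builder instance.</p>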
   */
  private KijiGatherJobBuilder() {
    mGathererClass = null;
    mCombinerClass = null;
    mReducerClass = null;
    mMapper = new GatherMapper();
    mGatherer = null;
    mCombiner = null;
    mReducer = null;
    mDataRequest = null;
  }

  /**
   * Creates a new builder for Kiji gather jobs.
   *
   * @return a new Kiji gather job builder.
   */
  public static KijiGatherJobBuilder create() {
    return new KijiGatherJobBuilder();
  }

  /**
   * Configures the job with the Kiji gatherer to run in the map phase.
   *
   * @param gathererClass The gatherer class.
   * @return This builder instance so you may chain configuration method calls.
   */
  public KijiGatherJobBuilder withGatherer(Class<? extends KijiGatherer> gathererClass) {
    mGathererClass = gathererClass;
    return this;
  }

  /**
   * Configures the job with a combiner to run (optional).
   *
   * @param combinerClass The combiner class.
   * @return This builder instance so you may chain configuration method calls.
   */
  public KijiGatherJobBuilder withCombiner(Class<? extends KijiReducer> combinerClass) {
    mCombinerClass = combinerClass;
    return this;
  }

  /**
   * Configures the job with a reducer to run (optional).
   *
   * @param reducerClass The reducer class.
   * @return This builder instance so you may chain configuration method calls.
   */
  public KijiGatherJobBuilder withReducer(Class<? extends KijiReducer> reducerClass) {
    mReducerClass = reducerClass;
    return this;
  }

  /** {@inheritDoc} */
  @Override
  protected void configureJob(Job job) throws IOException {
    // Construct the gatherer instance.
    if (null == mGathererClass) {
      throw new JobConfigurationException("Must specify a gatherer.");
    }
    final Configuration conf = job.getConfiguration();

    // Serialize the gatherer class name into the job configuration.
    conf.setClass(KijiConfKeys.KIJI_GATHERER_CLASS, mGathererClass, KijiGatherer.class);

    if ((getJobOutput() instanceof HFileMapReduceJobOutput) && (null == mReducerClass)) {
      mReducerClass = IdentityReducer.class;
    }

    final StringBuilder name =
        new StringBuilder("Kiji gather: " + mGathererClass.getSimpleName());
    if (null != mReducerClass) {
      name.append(" / " + mReducerClass.getSimpleName());
    }
    job.setJobName(name.toString());

    mGatherer = ReflectionUtils.newInstance(mGathererClass, conf);
    mMapper.setConf(conf);
    mDataRequest = mGatherer.getDataRequest();

    // Construct the combiner instance (if specified).
    if (null != mCombinerClass) {
      mCombiner = ReflectionUtils.newInstance(mCombinerClass, conf);
    }

    // Construct the reducer instance (if specified).
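    // Note: when writing to HFiles with no explicit reducer, mReducerClass was set to
    // IdentityReducer above, so a reducer instance is still created here.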
    if (null != mReducerClass) {
      mReducer = ReflectionUtils.newInstance(mReducerClass, conf);
    }

    // Configure the table input job (requires mGatherer, mMapper and mReducer to be set):
    super.configureJob(job);

    // Some validation:
    if (getJobOutput() instanceof HFileMapReduceJobOutput) {
      if (mReducer instanceof IdentityReducer) {
        Preconditions.checkState(
            mGatherer.getOutputKeyClass() == HFileKeyValue.class,
            String.format(
                "Gatherer '%s' writing HFiles must output HFileKeyValue keys, but got '%s'",
                mGathererClass.getName(),
                mGatherer.getOutputKeyClass().getName()));
        Preconditions.checkState(
            mGatherer.getOutputValueClass() == NullWritable.class,
            String.format(
                "Gatherer '%s' writing HFiles must output NullWritable values, but got '%s'",
                mGathererClass.getName(),
                mGatherer.getOutputValueClass().getName()));
      }
      Preconditions.checkState(
          mReducer.getOutputKeyClass() == HFileKeyValue.class,
          String.format(
              "Reducer '%s' writing HFiles must output HFileKeyValue keys, but got '%s'",
              mReducerClass.getName(),
              mReducer.getOutputKeyClass().getName()));
      Preconditions.checkState(
          mReducer.getOutputValueClass() == NullWritable.class,
          String.format(
              "Reducer '%s' writing HFiles must output NullWritable values, but got '%s'",
              mReducerClass.getName(),
              mReducer.getOutputValueClass().getName()));
    }
  }

  /** {@inheritDoc} */
  @Override
  protected Map<String, KeyValueStore<?, ?>> getRequiredStores() throws IOException {
    Map<String, KeyValueStore<?, ?>> requiredStores = new HashMap<String, KeyValueStore<?, ?>>();

    Map<String, KeyValueStore<?, ?>> gathererStores = mGatherer.getRequiredStores();
    if (null != gathererStores) {
      mergeStores(requiredStores, gathererStores);
    }

    if (null != mCombiner && mCombiner instanceof KeyValueStoreClient) {
      Map<String, KeyValueStore<?, ?>> combinerStores =
          ((KeyValueStoreClient) mCombiner).getRequiredStores();
      if (null != combinerStores) {
        mergeStores(requiredStores, combinerStores);
      }
    }

    if (null != mReducer && mReducer instanceof KeyValueStoreClient) {
      Map<String, KeyValueStore<?, ?>> reducerStores =
          ((KeyValueStoreClient) mReducer).getRequiredStores();
      if (null != reducerStores) {
        mergeStores(requiredStores, reducerStores);
      }
    }

    return requiredStores;
  }

  /** {@inheritDoc} */
  @Override
  protected KijiMapReduceJob build(Job job) {
    return KijiMapReduceJob.create(job);
  }

  /** {@inheritDoc} */
  @Override
  protected KijiDataRequest getDataRequest() {
    return mDataRequest;
  }

  /** {@inheritDoc} */
  @Override
  protected KijiMapper<?, ?, ?, ?> getMapper() {
    return mMapper;
  }

  /** {@inheritDoc} */
  @Override
  protected KijiReducer<?, ?, ?, ?> getCombiner() {
    return mCombiner;
  }

  /** {@inheritDoc} */
  @Override
  protected KijiReducer<?, ?, ?, ?> getReducer() {
    return mReducer;
  }

  /** {@inheritDoc} */
  @Override
  protected Class<?> getJarClass() {
    return mGathererClass;
  }

  /** {@inheritDoc} */
  @Override
  protected void configureOutput(Job job) throws IOException {
    final MapReduceJobOutput output = getJobOutput();
    if (null == output) {
      throw new JobConfigurationException("Must specify job output.");
    }

    final KijiReducer reducer = getReducer();

    if (output instanceof HFileMapReduceJobOutput) {
      if (reducer instanceof IdentityReducer) {
        output.configure(job);
      } else {
        // Cannot use the HFile output format if the reducer is not IdentityReducer:
        // Writing HFiles from a Kiji reducer requires an extra map/reduce to sort the HFile keys.
        // This forces the output format of this MapReduce job to be SequenceFile.
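        // The extra MapReduce job (warned about below) then sorts that intermediate
        // output into HFiles.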
        final HFileMapReduceJobOutput hfileOutput = (HFileMapReduceJobOutput) output;
        LOG.warn("Reducing to HFiles will require an extra MapReduce job.");
        new HFileReducerMapReduceJobOutput(hfileOutput).configure(job);
      }
    } else {
      output.configure(job);
    }
  }
}