/** * (c) Copyright 2012 WibiData, Inc. * * See the NOTICE file distributed with this work for additional * information regarding copyright ownership. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.kiji.mapreduce.produce; import java.io.IOException; import java.util.Map; import com.google.common.base.Preconditions; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.lib.map.KijiMultithreadedMapper; import org.apache.hadoop.util.ReflectionUtils; import org.kiji.annotations.ApiAudience; import org.kiji.annotations.ApiStability; import org.kiji.mapreduce.JobConfigurationException; import org.kiji.mapreduce.KijiMapReduceJob; import org.kiji.mapreduce.KijiMapper; import org.kiji.mapreduce.KijiReducer; import org.kiji.mapreduce.MapReduceJobOutput; import org.kiji.mapreduce.framework.KijiConfKeys; import org.kiji.mapreduce.framework.KijiTableInputJobBuilder; import org.kiji.mapreduce.kvstore.KeyValueStore; import org.kiji.mapreduce.output.KijiTableMapReduceJobOutput; import org.kiji.mapreduce.produce.impl.KijiProducers; import org.kiji.mapreduce.produce.impl.ProduceMapper; import org.kiji.mapreduce.reducer.IdentityReducer; import org.kiji.schema.EntityId; import org.kiji.schema.KijiDataRequest; import org.kiji.schema.KijiRowData; import org.kiji.schema.KijiTable; /** Builds jobs that run a producer over a Kiji table. */ @ApiAudience.Public @ApiStability.Stable public final class KijiProduceJobBuilder extends KijiTableInputJobBuilder<KijiProduceJobBuilder> { /** The default number of threads per mapper to use for running producers. */ private static final int DEFAULT_NUM_THREADS_PER_MAPPER = 1; /** The class of the producer to run. */ private Class<? extends KijiProducer> mProducerClass; /** The number of threads per mapper to use for running producers. */ private int mNumThreadsPerMapper; /** Producer job output. */ private KijiTableMapReduceJobOutput mJobOutput; /** The producer instance. */ private KijiProducer mProducer; /** The mapper instance. */ private KijiMapper<?, ?, ?, ?> mMapper; /** The reducer instance. */ private KijiReducer<?, ?, ?, ?> mReducer; /** The data request for the job's table input. */ private KijiDataRequest mDataRequest; /** Constructs a builder for jobs that run a Kiji producer over a Kiji table. */ private KijiProduceJobBuilder() { mProducerClass = null; mNumThreadsPerMapper = DEFAULT_NUM_THREADS_PER_MAPPER; mJobOutput = null; mProducer = null; mMapper = null; mReducer = null; mDataRequest = null; } /** * Creates a new builder for Kiji produce jobs. * * @return a new Kiji produce job builder. */ public static KijiProduceJobBuilder create() { return new KijiProduceJobBuilder(); } /** * Configures the job with the Kiji producer to run. * * @param producerClass The producer class. * @return This builder instance so you may chain configuration method calls. */ public KijiProduceJobBuilder withProducer(Class<? extends KijiProducer> producerClass) { mProducerClass = producerClass; return this; } /** * Configures the producer output. * * @param jobOutput Output table of the producer must match the input table. * @return this builder instance so you may chain configuration method calls. */ public KijiProduceJobBuilder withOutput(KijiTableMapReduceJobOutput jobOutput) { mJobOutput = jobOutput; return super.withOutput(jobOutput); } /** * {@inheritDoc} * * @param jobOutput Output table of the producer must match the input table. Must be an instance * of KijiTableMapReduceJobOutput or a subclass. */ @Override public KijiProduceJobBuilder withOutput(MapReduceJobOutput jobOutput) { if (jobOutput instanceof KijiTableMapReduceJobOutput) { return withOutput((KijiTableMapReduceJobOutput) jobOutput); } else { // Throw a more helpful debugging message. throw new RuntimeException("jobOutput parameter of KijiProduceJobBuilder.withOutput() must " + "be a KijiTableMapReduceJobOutput."); } } /** * Sets the number of threads to use for running the producer in parallel. * * <p>You may use this setting to run multiple instances of your producer in parallel * within each map task of the job. This may useful for increasing throughput when your * producer is not CPU bound.</p> * * @param numThreads The number of produce-runner threads to use per mapper. * @return This build instance so you may chain configuration method calls. */ public KijiProduceJobBuilder withNumThreads(int numThreads) { Preconditions.checkArgument(numThreads >= 1, "numThreads must be positive, got %d", numThreads); mNumThreadsPerMapper = numThreads; return this; } /** {@inheritDoc} */ @Override protected void configureJob(Job job) throws IOException { final Configuration conf = job.getConfiguration(); // Construct the producer instance. if (null == mProducerClass) { throw new JobConfigurationException("Must specify a producer."); } // Serialize the producer class name into the job configuration. conf.setClass(KijiConfKeys.KIJI_PRODUCER_CLASS, mProducerClass, KijiProducer.class); // Write to the table, but make sure the output table is the same as the input table. if (!getInputTableURI().equals(mJobOutput.getOutputTableURI())) { throw new JobConfigurationException("Output table must be the same as the input table."); } // Producers should output to HFiles. mMapper = new ProduceMapper(); mReducer = new IdentityReducer<Object, Object>(); job.setJobName("Kiji produce: " + mProducerClass.getSimpleName()); mProducer = ReflectionUtils.newInstance(mProducerClass, job.getConfiguration()); mDataRequest = mProducer.getDataRequest(); // Configure the table input job. super.configureJob(job); } /** {@inheritDoc} */ @Override protected void configureMapper(Job job) throws IOException { super.configureMapper(job); // Configure map-parallelism if configured. if (mNumThreadsPerMapper > 1) { @SuppressWarnings("unchecked") Class<? extends Mapper<EntityId, KijiRowData, Object, Object>> childMapperClass = (Class<? extends Mapper<EntityId, KijiRowData, Object, Object>>) mMapper.getClass(); KijiMultithreadedMapper.setMapperClass(job, childMapperClass); KijiMultithreadedMapper.setNumberOfThreads(job, mNumThreadsPerMapper); job.setMapperClass(KijiMultithreadedMapper.class); } } /** {@inheritDoc} */ @Override protected Map<String, KeyValueStore<?, ?>> getRequiredStores() throws IOException { return mProducer.getRequiredStores(); } /** {@inheritDoc} */ @Override protected KijiMapReduceJob build(Job job) { return KijiMapReduceJob.create(job); } /** {@inheritDoc} */ @Override protected KijiDataRequest getDataRequest() { return mDataRequest; } /** {@inheritDoc} */ @Override protected void validateInputTable(KijiTable inputTable) throws IOException { // Validate the Kiji data request against the input table layout: super.validateInputTable(inputTable); // Validate the producer output column against the output table (ie. the input table): KijiProducers.validateOutputColumn(mProducer, inputTable.getLayout()); } /** {@inheritDoc} */ @Override protected KijiMapper<?, ?, ?, ?> getMapper() { return mMapper; } /** {@inheritDoc} */ @Override protected KijiReducer<?, ?, ?, ?> getCombiner() { // Producers can't have combiners. return null; } /** {@inheritDoc} */ @Override protected KijiReducer<?, ?, ?, ?> getReducer() { return mReducer; } /** {@inheritDoc} */ @Override protected Class<?> getJarClass() { return mProducerClass; } }