/**
 * (c) Copyright 2013 WibiData, Inc.
 *
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.kiji.mapreduce.pivot;

import java.io.IOException;
import java.util.Map;

import com.google.common.base.Preconditions;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.map.KijiMultithreadedMapper;
import org.apache.hadoop.util.ReflectionUtils;

import org.kiji.annotations.ApiAudience;
import org.kiji.annotations.ApiStability;
import org.kiji.mapreduce.JobConfigurationException;
import org.kiji.mapreduce.KijiMapReduceJob;
import org.kiji.mapreduce.KijiMapper;
import org.kiji.mapreduce.KijiReducer;
import org.kiji.mapreduce.MapReduceJobOutput;
import org.kiji.mapreduce.framework.KijiConfKeys;
import org.kiji.mapreduce.framework.KijiTableInputJobBuilder;
import org.kiji.mapreduce.kvstore.KeyValueStore;
import org.kiji.mapreduce.output.KijiTableMapReduceJobOutput;
import org.kiji.mapreduce.pivot.impl.PivoterMapper;
import org.kiji.mapreduce.reducer.IdentityReducer;
import org.kiji.schema.EntityId;
import org.kiji.schema.KijiDataRequest;
import org.kiji.schema.KijiRowData;

/**
 * Builds jobs that run a {@link KijiPivoter} over a Kiji table.
 *
 * <p>
 *   {@link KijiPivoter} scans the rows from an input KijiTable and writes cells
 *   into an output KijiTable. The input and the output KijiTable may or may not be the same.
 * </p>
 *
 * <p>
 *   Use the {@link KijiPivotJobBuilder} to configure a {@link KijiPivoter} job, by specifying:
 *   <ul>
 *     <li> the {@link KijiPivoter} class to run over the input KijiTable; </li>
 *     <li> the input {@link org.kiji.schema.KijiTable} to be processed by the {@link KijiPivoter};
 *     </li>
 *     <li> the output {@link org.kiji.schema.KijiTable} the {@link KijiPivoter} writes to. </li>
 *   </ul>
 * </p>
 *
 * <p> Example:
 * <pre><blockquote>
 *   final Configuration conf = ...;
 *   final KijiURI inputTableURI = ...;
 *   final KijiURI outputTableURI = ...;
 *   final KijiMapReduceJob job = KijiPivotJobBuilder.create()
 *       .withConf(conf)
 *       .withPivoter(SomePivoter.class)
 *       .withInputTable(inputTableURI)
 *       .withOutput(MapReduceJobOutputs
 *           .newHFileMapReduceJobOutput(outputTableURI, hfilePath))
 *       .build();
 *   job.run();
 * </blockquote></pre>
 * </p>
 */
@ApiAudience.Public
@ApiStability.Experimental
public final class KijiPivotJobBuilder extends KijiTableInputJobBuilder<KijiPivotJobBuilder> {

  /** Default number of threads per mapper to use for running pivoters. */
  private static final int DEFAULT_NUM_THREADS_PER_MAPPER = 1;

  /** {@link KijiPivoter} class to run over the table. */
  private Class<? extends KijiPivoter> mPivoterClass;

  /** Configured number of threads per mapper to use for running pivoters. */
  private int mNumThreadsPerMapper;

  /** Pivoter to run for this KijiMR pivot job. Instantiated in {@link #configureJob(Job)}. */
  private KijiPivoter mPivoter;

  /** Hadoop mapper to run for this KijiMR pivot job. Set in {@link #configureJob(Job)}. */
  private KijiMapper<?, ?, ?, ?> mMapper;

  /** Hadoop reducer to run for this KijiMR pivot job. Set in {@link #configureJob(Job)}. */
  private KijiReducer<?, ?, ?, ?> mReducer;

  /** Specification of the data requested for this pivot job, as reported by the pivoter. */
  private KijiDataRequest mDataRequest;

  /** Constructs a builder for jobs that run a {@link KijiPivoter} over a Kiji table. */
  private KijiPivotJobBuilder() {
    mPivoterClass = null;
    mNumThreadsPerMapper = DEFAULT_NUM_THREADS_PER_MAPPER;
    mPivoter = null;
    mMapper = null;
    mReducer = null;
    mDataRequest = null;
  }

  /**
   * Creates a new builder for a {@link KijiPivoter} job.
   *
   * @return a new builder for a {@link KijiPivoter} job.
   */
  public static KijiPivotJobBuilder create() {
    return new KijiPivotJobBuilder();
  }

  /**
   * Configures the job with the {@link KijiPivoter} to run.
   *
   * @param pivoterClass {@link KijiPivoter} class to run over the input Kiji table.
   * @return this builder instance.
   */
  public KijiPivotJobBuilder withPivoter(Class<? extends KijiPivoter> pivoterClass) {
    mPivoterClass = pivoterClass;
    return this;
  }

  /**
   * Configures the output table of this pivoter.
   *
   * @param jobOutput Kiji table the pivoter writes to.
   * @return this builder instance.
   */
  public KijiPivotJobBuilder withOutput(KijiTableMapReduceJobOutput jobOutput) {
    return super.withOutput(jobOutput);
  }

  /**
   * {@inheritDoc}
   *
   * <p> The output of a pivoter must be a KijiTable. </p>
   *
   * @throws RuntimeException if {@code jobOutput} is not a {@link KijiTableMapReduceJobOutput}.
   */
  @Override
  public KijiPivotJobBuilder withOutput(MapReduceJobOutput jobOutput) {
    if (jobOutput instanceof KijiTableMapReduceJobOutput) {
      return withOutput((KijiTableMapReduceJobOutput) jobOutput);
    } else {
      throw new RuntimeException("KijiPivoter must output to a Kiji table.");
    }
  }

  /**
   * Sets the number of threads to use for running the pivoter in parallel.
   *
   * <p>You may use this setting to run multiple instances of the pivoter in parallel
   * within each map task of the job.  This may be useful for increasing throughput when the
   * pivoter is not CPU bound.</p>
   *
   * @param numThreads Number of threads to use per mapper. Must be at least 1.
   * @return this builder instance.
   * @throws IllegalArgumentException if {@code numThreads} is less than 1.
   */
  public KijiPivotJobBuilder withNumThreads(int numThreads) {
    // Guava's Preconditions only substitutes %s placeholders (not %d).
    Preconditions.checkArgument(numThreads >= 1,
        "numThreads must be positive, got %s", numThreads);
    mNumThreadsPerMapper = numThreads;
    return this;
  }

  /**
   * {@inheritDoc}
   *
   * <p> Records the pivoter class in the job configuration, instantiates the pivoter to obtain
   * its data request, and sets up the mapper and reducer before delegating to the parent
   * builder. </p>
   *
   * @throws JobConfigurationException if no pivoter class has been specified.
   */
  @Override
  protected void configureJob(Job job) throws IOException {
    final Configuration conf = job.getConfiguration();

    if (null == mPivoterClass) {
      throw new JobConfigurationException("Must specify a KijiPivoter class.");
    }

    // Serialize the pivoter class name into the job configuration.
    conf.setClass(KijiConfKeys.KIJI_PIVOTER_CLASS, mPivoterClass, KijiPivoter.class);

    // A pivot job maps rows through the pivoter and passes the results through unchanged.
    mMapper = new PivoterMapper();
    mReducer = new IdentityReducer<Object, Object>();

    job.setJobName("KijiPivoter: " + mPivoterClass.getSimpleName());

    // The pivoter instance must exist before super.configureJob() runs, since the data request
    // and required stores are read from it through the getters below.
    mPivoter = ReflectionUtils.newInstance(mPivoterClass, conf);
    mDataRequest = mPivoter.getDataRequest();

    // Configure the table input job.
    super.configureJob(job);
  }

  /** {@inheritDoc} */
  @Override
  protected void configureMapper(Job job) throws IOException {
    super.configureMapper(job);

    // Configure map-parallelism if configured.
    if (mNumThreadsPerMapper > 1) {
      // Wrap the pivoter mapper in a multithreaded mapper that runs it on several threads.
      @SuppressWarnings("unchecked")
      Class<? extends Mapper<EntityId, KijiRowData, Object, Object>> childMapperClass =
          (Class<? extends Mapper<EntityId, KijiRowData, Object, Object>>) mMapper.getClass();
      KijiMultithreadedMapper.setMapperClass(job, childMapperClass);
      KijiMultithreadedMapper.setNumberOfThreads(job, mNumThreadsPerMapper);
      job.setMapperClass(KijiMultithreadedMapper.class);
    }
  }

  /** {@inheritDoc} */
  @Override
  protected Map<String, KeyValueStore<?, ?>> getRequiredStores() throws IOException {
    return mPivoter.getRequiredStores();
  }

  /** {@inheritDoc} */
  @Override
  protected KijiMapReduceJob build(Job job) {
    return KijiMapReduceJob.create(job);
  }

  /** {@inheritDoc} */
  @Override
  protected KijiDataRequest getDataRequest() {
    return mDataRequest;
  }

  /** {@inheritDoc} */
  @Override
  protected KijiMapper<?, ?, ?, ?> getMapper() {
    return mMapper;
  }

  /** {@inheritDoc} */
  @Override
  protected KijiReducer<?, ?, ?, ?> getCombiner() {
    // A pivoter cannot have combiners.
    return null;
  }

  /** {@inheritDoc} */
  @Override
  protected KijiReducer<?, ?, ?, ?> getReducer() {
    return mReducer;
  }

  /** {@inheritDoc} */
  @Override
  protected Class<?> getJarClass() {
    return mPivoterClass;
  }
}