/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.mrunit; import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.Counters; import org.apache.hadoop.mapred.Mapper; import org.apache.hadoop.mapred.Reducer; import org.apache.hadoop.mrunit.types.Pair; /** * Harness that allows you to test a dataflow through a set of Mappers and * Reducers. You provide a set of (Mapper, Reducer) "jobs" that make up * a workflow, as well as a set of (key, value) pairs to pass in to the first * Mapper. You can also specify the outputs you expect to be sent to the final * Reducer in the pipeline. * * By calling runTest(), the harness will deliver the input to the first * Mapper, feed the intermediate results to the first Reducer (without checking * them), and proceed to forward this data along to subsequent Mapper/Reducer * jobs in the pipeline until the final Reducer. The last Reducer's outputs are * checked against the expected results. * * This is designed for slightly more complicated integration tests than the * MapReduceDriver, which is for smaller unit tests. * * (K1, V1) in the type signature refer to the types associated with the inputs * to the first Mapper. (K2, V2) refer to the types associated with the final * Reducer's output. No intermediate types are specified. */ public class PipelineMapReduceDriver<K1, V1, K2, V2> extends TestDriver<K1, V1, K2, V2> { public static final Log LOG = LogFactory.getLog(PipelineMapReduceDriver.class); private List<Pair<Mapper, Reducer>> mapReducePipeline; private List<Pair<K1, V1>> inputList; private Counters counters; public PipelineMapReduceDriver(final List<Pair<Mapper, Reducer>> pipeline) { this.mapReducePipeline = copyMapReduceList(pipeline); this.inputList = new ArrayList<Pair<K1, V1>>(); this.counters = new Counters(); } public PipelineMapReduceDriver() { this.mapReducePipeline = new ArrayList<Pair<Mapper, Reducer>>(); this.inputList = new ArrayList<Pair<K1, V1>>(); this.counters = new Counters(); } private List<Pair<Mapper, Reducer>> copyMapReduceList(List<Pair<Mapper, Reducer>> lst) { List<Pair<Mapper, Reducer>> outList = new ArrayList<Pair<Mapper, Reducer>>(); for (Pair<Mapper, Reducer> p : lst) { // Take advantage of the fact that Pair is immutable. outList.add(p); } return outList; } /** @return the counters used in this test */ public Counters getCounters() { return counters; } /** Sets the counters object to use for this test. * @param ctrs The counters object to use. */ public void setCounters(final Counters ctrs) { this.counters = ctrs; } /** Sets the counters to use and returns self for fluent style */ public PipelineMapReduceDriver<K1, V1, K2, V2> withCounters(final Counters ctrs) { setCounters(ctrs); return this; } /** Add a Mapper and Reducer instance to the pipeline to use with this test driver * @param m The Mapper instance to add to the pipeline * @param r The Reducer instance to add to the pipeline */ public void addMapReduce(Mapper m, Reducer r) { Pair<Mapper, Reducer> p = new Pair<Mapper, Reducer>(m, r); this.mapReducePipeline.add(p); } /** Add a Mapper and Reducer instance to the pipeline to use with this test driver * @param p The Mapper and Reducer instances to add to the pipeline */ public void addMapReduce(Pair<Mapper, Reducer> p) { this.mapReducePipeline.add(p); } /** Add a Mapper and Reducer instance to the pipeline to use with this test driver * using fluent style * @param m The Mapper instance to use * @param r The Reducer instance to use */ public PipelineMapReduceDriver<K1, V1, K2, V2> withMapReduce(Mapper m, Reducer r) { addMapReduce(m, r); return this; } /** Add a Mapper and Reducer instance to the pipeline to use with this test driver * using fluent style * @param p The Mapper and Reducer instances to add to the pipeline */ public PipelineMapReduceDriver<K1, V1, K2, V2> withMapReduce(Pair<Mapper, Reducer> p) { addMapReduce(p); return this; } /** * @return A copy of the list of Mapper and Reducer objects under test */ public List<Pair<Mapper, Reducer>> getMapReducePipeline() { return copyMapReduceList(this.mapReducePipeline); } /** * Adds an input to send to the mapper * @param key * @param val */ public void addInput(K1 key, V1 val) { inputList.add(new Pair<K1, V1>(key, val)); } /** * Identical to addInput() but returns self for fluent programming style * @param key * @param val * @return this */ public PipelineMapReduceDriver<K1, V1, K2, V2> withInput(K1 key, V1 val) { addInput(key, val); return this; } /** * Adds an input to send to the Mapper * @param input The (k, v) pair to add to the input list. */ public void addInput(Pair<K1, V1> input) { if (null == input) { throw new IllegalArgumentException("Null input in addInput()"); } inputList.add(input); } /** * Identical to addInput() but returns self for fluent programming style * @param input The (k, v) pair to add * @return this */ public PipelineMapReduceDriver<K1, V1, K2, V2> withInput( Pair<K1, V1> input) { addInput(input); return this; } /** * Adds an output (k, v) pair we expect from the Reducer * @param outputRecord The (k, v) pair to add */ public void addOutput(Pair<K2, V2> outputRecord) { if (null != outputRecord) { expectedOutputs.add(outputRecord); } else { throw new IllegalArgumentException("Tried to add null outputRecord"); } } /** * Works like addOutput(), but returns self for fluent style * @param outputRecord * @return this */ public PipelineMapReduceDriver<K1, V1, K2, V2> withOutput( Pair<K2, V2> outputRecord) { addOutput(outputRecord); return this; } /** * Adds a (k, v) pair we expect as output from the Reducer * @param key * @param val */ public void addOutput(K2 key, V2 val) { addOutput(new Pair<K2, V2>(key, val)); } /** * Functions like addOutput() but returns self for fluent programming style * @param key * @param val * @return this */ public PipelineMapReduceDriver<K1, V1, K2, V2> withOutput(K2 key, V2 val) { addOutput(key, val); return this; } /** * Expects an input of the form "key \t val" * Forces the Mapper input types to Text. * @param input A string of the form "key \t val". Trims any whitespace. */ public void addInputFromString(String input) { if (null == input) { throw new IllegalArgumentException("null input given to setInput"); } else { Pair<Text, Text> inputPair = parseTabbedPair(input); if (null != inputPair) { // I know this is not type-safe, but I don't // know a better way to do this. addInput((Pair<K1, V1>) inputPair); } else { throw new IllegalArgumentException("Could not parse input pair in addInput"); } } } /** * Identical to addInputFromString, but with a fluent programming style * @param input A string of the form "key \t val". Trims any whitespace. * @return this */ public PipelineMapReduceDriver<K1, V1, K2, V2> withInputFromString(String input) { addInputFromString(input); return this; } /** * Expects an input of the form "key \t val" * Forces the Reducer output types to Text. * @param output A string of the form "key \t val". Trims any whitespace. */ public void addOutputFromString(String output) { if (null == output) { throw new IllegalArgumentException("null input given to setOutput"); } else { Pair<Text, Text> outputPair = parseTabbedPair(output); if (null != outputPair) { // I know this is not type-safe, // but I don't know a better way to do this. addOutput((Pair<K2, V2>) outputPair); } else { throw new IllegalArgumentException( "Could not parse output pair in setOutput"); } } } /** * Identical to addOutputFromString, but with a fluent programming style * @param output A string of the form "key \t val". Trims any whitespace. * @return this */ public PipelineMapReduceDriver<K1, V1, K2, V2> withOutputFromString(String output) { addOutputFromString(output); return this; } public List<Pair<K2, V2>> run() throws IOException { // inputs starts with the user-provided inputs. List inputs = this.inputList; if (mapReducePipeline.size() == 0) { LOG.warn("No Mapper or Reducer instances in pipeline; this is a trivial test."); } if (inputs.size() == 0) { LOG.warn("No inputs configured to send to MapReduce pipeline; this is a trivial test."); } for (Pair<Mapper, Reducer> job : mapReducePipeline) { // Create a MapReduceDriver to run this phase of the pipeline. MapReduceDriver mrDriver = new MapReduceDriver(job.getFirst(), job.getSecond()); mrDriver.setCounters(getCounters()); // Add the inputs from the user, or from the previous stage of the pipeline. for (Object input : inputs) { mrDriver.addInput((Pair) input); } // Run the MapReduce "job". The output of this job becomes // the input to the next job. inputs = mrDriver.run(); } // The last list of values stored in "inputs" is actually the outputs. // Unfortunately, due to the variable-length list of MR passes the user // can test, this is not type-safe. return (List<Pair<K2, V2>>) inputs; } @Override public void runTest() throws RuntimeException { List<Pair<K2, V2>> outputs = null; boolean succeeded; try { outputs = run(); validate(outputs); } catch (IOException ioe) { LOG.error("IOException: " + ioe.toString()); LOG.debug("Setting success to false based on IOException"); throw new RuntimeException(); } } }