/*! ******************************************************************************
*
* Pentaho Data Integration
*
* Copyright (C) 2002-2017 by Pentaho : http://www.pentaho.com
*
*******************************************************************************
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
package org.pentaho.hadoop.mapreduce;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.mockito.Mockito.mock;
import java.io.IOException;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import org.pentaho.di.core.KettleEnvironment;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.logging.LogLevel;
import org.pentaho.di.trans.TransMeta;
/**
 * Integration test for the word-count mapper and reducer transformations.
 * <p>
 * The test drives {@link PentahoMapRunnable} and {@link GenericTransReduce} directly, feeding them from a
 * {@link MockRecordReader} and capturing their output in {@link MockOutputCollector} instances, and then checks
 * both the intermediate (mapper) output and the final (reducer) output.
 *
 * @author Tatsiana_Kasiankova
 */
@SuppressWarnings( { "unchecked", "rawtypes" } )
public class PentahoMapReduceIT {

  // Turn off debug messages for the tests.
  private static final boolean DEBUG_MODE = false;

  /** A single input row; every row fed to the mapper is this same sentence. */
  private static final String WORDS_TO_CALCULATE = "zebra giraffe hippo elephant tiger";

  /**
   * The words of {@link #WORDS_TO_CALCULATE}. The assertions below rely on these words being distinct, so that
   * each one is expected as exactly one output key and occurs exactly once per input row.
   */
  private static final List<String> EXPECTED_WORDS = Arrays.asList( WORDS_TO_CALCULATE.split( " " ) );

  /** Number of identical input rows handed to the mapper. */
  private static final int ROWS_TO_CALCULATE = 10000;

  private Reporter reporterMock = mock( Reporter.class );
  private PentahoMapRunnable mapRunnable;
  private JobConf mrJobConfig;
  private TransMeta transMeta;
  private MockOutputCollector outputCollectorMock = new MockOutputCollector();
  private MockOutputCollector reducerInputCollectorMock = outputCollectorMock;
  private MockRecordReader reader;
  private GenericTransReduce genericTransReduce;

  /**
   * Initializes the Kettle environment once for the whole test class.
   *
   * @throws KettleException if the environment cannot be initialized
   */
  @BeforeClass
  public static void before() throws KettleException {
    KettleEnvironment.init();
  }

  /**
   * Creates a fresh map runnable, reducer and job configuration before each test.
   *
   * @throws KettleException declared for the test fixture; not thrown by the visible statements
   * @throws IOException     declared for the test fixture; not thrown by the visible statements
   */
  @Before
  public void setUp() throws KettleException, IOException {
    mapRunnable = new PentahoMapRunnable();
    genericTransReduce = new GenericTransReduce();
    mrJobConfig = new JobConf();
    // Turn off all debug messages from PentahoMapRunnable to reduce unit test logs.
    // Raise the level here when debugging is needed.
    mrJobConfig.set( "logLevel", LogLevel.ERROR.name() );
  }

  /**
   * Runs the word-count mapper over {@link #ROWS_TO_CALCULATE} identical rows, verifies that every word was
   * emitted once per row, then feeds the mapper output to the reducer and verifies the summed count per word.
   *
   * @throws IOException        if the mapper, the reducer or a collector fails
   * @throws KettleException    if a transformation cannot be loaded or configured
   * @throws URISyntaxException if a transformation resource URL cannot be converted to a URI
   */
  @Test
  public void testMapRunnable_WordCount() throws IOException, KettleException, URISyntaxException {
    // Create mapper transformation and configure the job with the appropriate settings
    transMeta = new TransMeta(
      getClass().getResource( MRTestUtil.PATH_TO_WORDCOUNT_MAPPER_TEST_TRANSFORMATION ).toURI().getPath() );
    MRTestUtil.configJobMapBaseCase( transMeta, mrJobConfig, mapRunnable );

    // Create reducer transformation and configure the job with the appropriate settings
    transMeta = new TransMeta(
      getClass().getResource( MRTestUtil.PATH_TO_WORDCOUNT_REDUCER_TEST_TRANSFORMATION ).toURI().getPath() );
    MRTestUtil.configJobReducerBaseCase( transMeta, mrJobConfig, genericTransReduce );

    // Create data rows with words to count: the same sentence repeated ROWS_TO_CALCULATE times
    List<String> wordsToCalculate = IntStream.rangeClosed( 1, ROWS_TO_CALCULATE )
      .mapToObj( row -> WORDS_TO_CALCULATE )
      .collect( Collectors.toList() );
    if ( DEBUG_MODE ) {
      System.out.println( "Mapper input data: " + ROWS_TO_CALCULATE + " rows of [" + WORDS_TO_CALCULATE + "]" );
    }
    reader = new MockRecordReader( wordsToCalculate );

    // execute mapper
    long start = System.currentTimeMillis();
    mapRunnable.run( reader, outputCollectorMock, reporterMock );
    outputCollectorMock.close();
    long stop = System.currentTimeMillis();
    if ( DEBUG_MODE ) {
      System.out.println( "Executed " + ROWS_TO_CALCULATE + " in " + ( stop - start ) + "ms" );
      System.out.println( "Average: " + ( ( stop - start ) / (float) ROWS_TO_CALCULATE ) + "ms" );
      System.out.println( "Rows/Second: " + ( ROWS_TO_CALCULATE / ( ( stop - start ) / 1000f ) ) );
    }
    dumpIfDebug( "Mapper output data: ", outputCollectorMock );

    // verifying mapped data
    assertNull( "Exception thrown", mapRunnable.getException() );
    assertNotNull( "Mapper produced no collection", outputCollectorMock.getCollection() );
    assertNotNull( "Mapper produced no key set", outputCollectorMock.getCollection().keySet() );
    assertEquals( "Unexpected number of distinct words emitted by the mapper", EXPECTED_WORDS.size(),
      outputCollectorMock.getCollection().keySet().size() );

    // verifying the arrays of word count for each word: the mapper emits a 1 per occurrence
    List<IntWritable> expectedWordCountArrays = IntStream.rangeClosed( 1, ROWS_TO_CALCULATE )
      .mapToObj( row -> new IntWritable( 1 ) )
      .collect( Collectors.toList() );
    for ( Object key : outputCollectorMock.getCollection().keySet() ) {
      assertEquals( "Incorrect count array for the word: " + key, expectedWordCountArrays,
        outputCollectorMock.getCollection().get( new Text( key.toString() ) ) );
    }

    // input data for reducer is going to be taken from mapper output data
    reducerInputCollectorMock = outputCollectorMock;
    dumpIfDebug( "Reducer input data: ", reducerInputCollectorMock );
    outputCollectorMock = new MockOutputCollector();

    // execute reducer
    for ( Object key : reducerInputCollectorMock.getCollection().keySet() ) {
      // The values are copied so the reducer iterates over its own list rather than the collector's.
      genericTransReduce.reduce( (Text) key,
        new ArrayList( reducerInputCollectorMock.getCollection().get( key ) ).iterator(), outputCollectorMock,
        reporterMock );
      // NOTE(review): close() is invoked after every key, as in the original test; confirm against
      // GenericTransReduce whether a per-key close is required or a single close after the loop is intended.
      genericTransReduce.close();
    }
    outputCollectorMock.close();
    dumpIfDebug( "Reducer output data: ", outputCollectorMock );

    // verifying reduced data
    assertNull( "Exception thrown", genericTransReduce.getException() );
    assertNotNull( "Reducer produced no collection", outputCollectorMock.getCollection() );
    assertNotNull( "Reducer produced no key set", outputCollectorMock.getCollection().keySet() );
    assertEquals( "Unexpected number of distinct words emitted by the reducer", EXPECTED_WORDS.size(),
      outputCollectorMock.getCollection().keySet().size() );

    // The reduced count for a word is the sum of the ones emitted for it by the mapper
    IntWritable expectedWordCount =
      new IntWritable( expectedWordCountArrays.stream().mapToInt( IntWritable::get ).sum() );
    for ( String wordToCount : EXPECTED_WORDS ) {
      assertEquals( "Incorrect reduced count for the word: " + wordToCount, expectedWordCount,
        outputCollectorMock.getCollection().get( new Text( wordToCount ) ).get( 0 ) );
    }
  }

  /**
   * Prints every key/value entry of the given collector, prefixed with {@code label}, when
   * {@link #DEBUG_MODE} is enabled; does nothing otherwise.
   *
   * @param label     text printed in front of each {@code key=value} pair
   * @param collector collector whose content is printed
   */
  private static void dumpIfDebug( String label, MockOutputCollector collector ) {
    if ( DEBUG_MODE ) {
      collector.getCollection().forEach( ( k, v ) -> System.out.println( label + k + "=" + v ) );
    }
  }
}