/*! ******************************************************************************
*
* Pentaho Data Integration
*
* Copyright (C) 2002-2016 by Pentaho : http://www.pentaho.com
*
*******************************************************************************
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
package org.pentaho.di.trans.steps.memgroupby;
import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.CoreMatchers.nullValue;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertThat;
import static org.mockito.Matchers.any;
import static org.mockito.Mockito.doNothing;
import static org.mockito.Mockito.doReturn;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.never;
import static org.mockito.Mockito.spy;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
import java.util.Date;
import java.util.List;
import java.util.Map;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.mockito.ArgumentCaptor;
import org.pentaho.di.core.Const;
import org.pentaho.di.core.KettleClientEnvironment;
import org.pentaho.di.core.RowMetaAndData;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.row.RowMeta;
import org.pentaho.di.core.row.RowMetaInterface;
import org.pentaho.di.core.row.ValueMetaInterface;
import org.pentaho.di.core.row.value.ValueMetaDate;
import org.pentaho.di.core.row.value.ValueMetaInteger;
import org.pentaho.di.core.row.value.ValueMetaNumber;
import org.pentaho.di.core.row.value.ValueMetaString;
import org.pentaho.di.core.variables.Variables;
import org.pentaho.di.trans.Trans;
import org.pentaho.di.trans.TransMeta;
import org.pentaho.di.trans.step.StepMeta;
import com.google.common.base.Function;
import com.google.common.base.Functions;
import com.google.common.base.Optional;
import com.google.common.collect.ContiguousSet;
import com.google.common.collect.DiscreteDomain;
import com.google.common.collect.FluentIterable;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Maps;
import com.google.common.collect.Range;
import com.google.common.collect.TreeBasedTable;
/**
* @author nhudak
*/
@RunWith( org.mockito.runners.MockitoJUnitRunner.class )
public class MemoryGroupByAggregationTest {
private Variables variables;
private Map<String, Integer> aggregates;
public static final String STEP_NAME = "testStep";
private static final ImmutableMap<String, Integer> default_aggregates;
static {
default_aggregates = ImmutableMap.<String, Integer>builder()
.put( "min", MemoryGroupByMeta.TYPE_GROUP_MIN )
.put( "max", MemoryGroupByMeta.TYPE_GROUP_MAX )
.put( "sum", MemoryGroupByMeta.TYPE_GROUP_SUM )
.put( "ave", MemoryGroupByMeta.TYPE_GROUP_AVERAGE )
.put( "count", MemoryGroupByMeta.TYPE_GROUP_COUNT_ALL )
.put( "count_any", MemoryGroupByMeta.TYPE_GROUP_COUNT_ANY )
.put( "count_distinct", MemoryGroupByMeta.TYPE_GROUP_COUNT_DISTINCT )
.build();
}
private RowMeta rowMeta;
private TreeBasedTable<Integer, Integer, Optional<Object>> data;
@BeforeClass
public static void setUpBeforeClass() throws KettleException {
KettleClientEnvironment.init();
}
@Before
public void setUp() throws Exception {
rowMeta = new RowMeta();
data = TreeBasedTable.create();
variables = new Variables();
aggregates = Maps.newHashMap( default_aggregates );
}
@Test
public void testDefault() throws Exception {
addColumn( new ValueMetaInteger( "intg" ), 0L, 1L, 1L, 10L );
addColumn( new ValueMetaInteger( "nul" ) );
addColumn( new ValueMetaInteger( "mix1" ), -1L, 2L );
addColumn( new ValueMetaInteger( "mix2" ), null, 7L );
addColumn( new ValueMetaNumber( "mix3" ), -1.0, 2.5 );
addColumn( new ValueMetaDate( "date1" ), new Date( 1L ), new Date( 2L ) );
RowMetaAndData output = runStep();
assertThat( output.getInteger( "intg_min" ), is( 0L ) );
assertThat( output.getInteger( "intg_max" ), is( 10L ) );
assertThat( output.getInteger( "intg_sum" ), is( 12L ) );
assertThat( output.getInteger( "intg_ave" ), is( 3L ) );
assertThat( output.getInteger( "intg_count" ), is( 4L ) );
assertThat( output.getInteger( "intg_count_any" ), is( 4L ) );
assertThat( output.getInteger( "intg_count_distinct" ), is( 3L ) );
assertThat( output.getInteger( "nul_min" ), nullValue() );
assertThat( output.getInteger( "nul_max" ), nullValue() );
assertThat( output.getInteger( "nul_sum" ), nullValue() );
assertThat( output.getInteger( "nul_ave" ), nullValue() );
assertThat( output.getInteger( "nul_count" ), is( 0L ) );
assertThat( output.getInteger( "nul_count_any" ), is( 4L ) );
assertThat( output.getInteger( "nul_count_distinct" ), is( 0L ) );
assertThat( output.getInteger( "mix1_max" ), is( 2L ) );
assertThat( output.getInteger( "mix1_min" ), is( -1L ) );
assertThat( output.getInteger( "mix1_sum" ), is( 1L ) );
assertThat( output.getInteger( "mix1_ave" ), is( 0L ) );
assertThat( output.getInteger( "mix1_count" ), is( 2L ) );
assertThat( output.getInteger( "mix1_count_any" ), is( 4L ) );
assertThat( output.getInteger( "mix1_count_distinct" ), is( 2L ) );
assertThat( output.getInteger( "mix2_max" ), is( 7L ) );
assertThat( output.getInteger( "mix2_min" ), is( 7L ) );
assertThat( output.getInteger( "mix2_sum" ), is( 7L ) );
assertThat( output.getNumber( "mix2_ave", Double.NaN ), is( 7.0 ) );
assertThat( output.getInteger( "mix2_count" ), is( 1L ) );
assertThat( output.getInteger( "mix2_count_any" ), is( 4L ) );
assertThat( output.getInteger( "mix2_count_distinct" ), is( 1L ) );
assertThat( output.getNumber( "mix3_max", Double.NaN ), is( 2.5 ) );
assertThat( output.getNumber( "mix3_min", Double.NaN ), is( -1.0 ) );
assertThat( output.getNumber( "mix3_sum", Double.NaN ), is( 1.5 ) );
assertThat( output.getNumber( "mix3_ave", Double.NaN ), is( 0.75 ) );
assertThat( output.getInteger( "mix3_count" ), is( 2L ) );
assertThat( output.getInteger( "mix3_count_any" ), is( 4L ) );
assertThat( output.getInteger( "mix3_count_distinct" ), is( 2L ) );
assertThat( output.getNumber( "date1_min", Double.NaN ), is( 1.0 ) );
assertThat( output.getNumber( "date1_max", Double.NaN ), is( 2.0 ) );
assertThat( output.getNumber( "date1_sum", Double.NaN ), is( 3.0 ) );
assertThat( output.getNumber( "date1_ave", Double.NaN ), is( 1.5 ) );
assertThat( output.getInteger( "date1_count" ), is( 2L ) );
assertThat( output.getInteger( "date1_count_any" ), is( 4L ) );
assertThat( output.getInteger( "date1_count_distinct" ), is( 2L ) );
}
@Test
public void testCompatibility() throws KettleException {
variables.setVariable( Const.KETTLE_COMPATIBILITY_MEMORY_GROUP_BY_SUM_AVERAGE_RETURN_NUMBER_TYPE, "Y" );
addColumn( new ValueMetaInteger( "intg" ), 0L, 1L, 1L, 10L );
addColumn( new ValueMetaInteger( "nul" ) );
addColumn( new ValueMetaInteger( "mix1" ), -1L, 2L );
addColumn( new ValueMetaInteger( "mix2" ), null, 7L );
addColumn( new ValueMetaNumber( "mix3" ), -1.0, 2.5 );
RowMetaAndData output = runStep();
assertThat( output.getInteger( "intg_min" ), is( 0L ) );
assertThat( output.getInteger( "intg_max" ), is( 10L ) );
assertThat( output.getInteger( "intg_sum" ), is( 12L ) );
assertThat( output.getInteger( "intg_ave" ), is( 3L ) );
assertThat( output.getInteger( "intg_count" ), is( 4L ) );
assertThat( output.getInteger( "intg_count_any" ), is( 4L ) );
assertThat( output.getInteger( "intg_count_distinct" ), is( 3L ) );
assertThat( output.getInteger( "nul_min" ), nullValue() );
assertThat( output.getInteger( "nul_max" ), nullValue() );
assertThat( output.getInteger( "nul_sum" ), nullValue() );
assertThat( output.getInteger( "nul_ave" ), nullValue() );
assertThat( output.getInteger( "nul_count" ), is( 0L ) );
assertThat( output.getInteger( "nul_count_any" ), is( 4L ) );
assertThat( output.getInteger( "nul_count_distinct" ), is( 0L ) );
assertThat( output.getInteger( "mix1_max" ), is( 2L ) );
assertThat( output.getInteger( "mix1_min" ), is( -1L ) );
assertThat( output.getInteger( "mix1_sum" ), is( 1L ) );
assertThat( output.getNumber( "mix1_ave", Double.NaN ), is( 0.5 ) );
assertThat( output.getInteger( "mix1_count" ), is( 2L ) );
assertThat( output.getInteger( "mix1_count_any" ), is( 4L ) );
assertThat( output.getInteger( "mix1_count_distinct" ), is( 2L ) );
assertThat( output.getInteger( "mix2_max" ), is( 7L ) );
assertThat( output.getInteger( "mix2_min" ), is( 7L ) );
assertThat( output.getInteger( "mix2_sum" ), is( 7L ) );
assertThat( output.getNumber( "mix2_ave", Double.NaN ), is( 7.0 ) );
assertThat( output.getInteger( "mix2_count" ), is( 1L ) );
assertThat( output.getInteger( "mix2_count_any" ), is( 4L ) );
assertThat( output.getInteger( "mix2_count_distinct" ), is( 1L ) );
assertThat( output.getNumber( "mix3_max", Double.NaN ), is( 2.5 ) );
assertThat( output.getNumber( "mix3_min", Double.NaN ), is( -1.0 ) );
assertThat( output.getNumber( "mix3_sum", Double.NaN ), is( 1.5 ) );
assertThat( output.getNumber( "mix3_ave", Double.NaN ), is( 0.75 ) );
assertThat( output.getInteger( "mix3_count" ), is( 2L ) );
assertThat( output.getInteger( "mix3_count_any" ), is( 4L ) );
assertThat( output.getInteger( "mix3_count_distinct" ), is( 2L ) );
}
@Test
public void testNullMin() throws Exception {
variables.setVariable( Const.KETTLE_AGGREGATION_MIN_NULL_IS_VALUED, "Y" );
addColumn( new ValueMetaInteger( "intg" ), null, 0L, 1L, -1L );
addColumn( new ValueMetaString( "str" ), "A", null, "B", null );
aggregates = Maps.toMap( ImmutableList.of( "min", "max" ), Functions.forMap( default_aggregates ) );
RowMetaAndData output = runStep();
assertThat( output.getInteger( "intg_min" ), nullValue() );
assertThat( output.getInteger( "intg_max" ), is( 1L ) );
assertThat( output.getString( "str_min", null ), nullValue() );
assertThat( output.getString( "str_max", "invalid" ), is( "B" ) );
}
@Test
public void testNullsAreZeroCompatible() throws Exception {
variables.setVariable( Const.KETTLE_AGGREGATION_ALL_NULLS_ARE_ZERO, "Y" );
variables.setVariable( Const.KETTLE_COMPATIBILITY_MEMORY_GROUP_BY_SUM_AVERAGE_RETURN_NUMBER_TYPE, "Y" );
addColumn( new ValueMetaInteger( "nul" ) );
addColumn( new ValueMetaInteger( "both" ), -2L, 0L, null, 10L );
RowMetaAndData output = runStep();
assertThat( output.getInteger( "nul_min" ), is( 0L ) );
assertThat( output.getInteger( "nul_max" ), is( 0L ) );
assertThat( output.getInteger( "nul_sum" ), is( 0L ) );
assertThat( output.getInteger( "nul_ave" ), is( 0L ) );
assertThat( output.getInteger( "nul_count" ), is( 0L ) );
assertThat( output.getInteger( "nul_count_any" ), is( 4L ) );
assertThat( output.getInteger( "nul_count_distinct" ), is( 0L ) );
assertThat( output.getInteger( "both_max" ), is( 10L ) );
assertThat( output.getInteger( "both_min" ), is( -2L ) );
assertThat( output.getInteger( "both_sum" ), is( 8L ) );
assertThat( output.getInteger( "both_ave" ), is( 3L ) );
assertThat( output.getInteger( "both_count" ), is( 3L ) );
assertThat( output.getInteger( "both_count_any" ), is( 4L ) );
assertThat( output.getInteger( "both_count_distinct" ), is( 3L ) );
}
@Test
public void testNullsAreZeroDefault() throws Exception {
variables.setVariable( Const.KETTLE_AGGREGATION_ALL_NULLS_ARE_ZERO, "Y" );
addColumn( new ValueMetaInteger( "nul" ) );
addColumn( new ValueMetaInteger( "both" ), -2L, 0L, null, 10L );
addColumn( new ValueMetaNumber( "both_num" ), -2.0, 0.0, null, 10.0 );
RowMetaAndData output = runStep();
assertThat( output.getInteger( "nul_min" ), is( 0L ) );
assertThat( output.getInteger( "nul_max" ), is( 0L ) );
assertThat( output.getInteger( "nul_sum" ), is( 0L ) );
assertThat( output.getInteger( "nul_ave" ), is( 0L ) );
assertThat( output.getInteger( "nul_count" ), is( 0L ) );
assertThat( output.getInteger( "nul_count_any" ), is( 4L ) );
assertThat( output.getInteger( "nul_count_distinct" ), is( 0L ) );
assertThat( output.getInteger( "both_max" ), is( 10L ) );
assertThat( output.getInteger( "both_min" ), is( -2L ) );
assertThat( output.getInteger( "both_sum" ), is( 8L ) );
assertThat( output.getInteger( "both_ave" ), is( 2L ) );
assertThat( output.getInteger( "both_count" ), is( 3L ) );
assertThat( output.getInteger( "both_count_any" ), is( 4L ) );
assertThat( output.getInteger( "both_count_distinct" ), is( 3L ) );
assertThat( output.getNumber( "both_num_max", Double.NaN ), is( 10.0 ) );
assertThat( output.getNumber( "both_num_min", Double.NaN ), is( -2.0 ) );
assertThat( output.getNumber( "both_num_sum", Double.NaN ), is( 8.0 ) );
assertEquals( 2.666666, output.getNumber( "both_num_ave", Double.NaN ), 0.000001 /* delta */ );
assertThat( output.getInteger( "both_num_count" ), is( 3L ) );
assertThat( output.getInteger( "both_num_count_any" ), is( 4L ) );
assertThat( output.getInteger( "both_num_count_distinct" ), is( 3L ) );
}
@Test
public void testSQLCompatible() throws Exception {
addColumn( new ValueMetaInteger( "value" ), null, -2L, null, 0L, null, 10L, null, null, 0L, null );
RowMetaAndData output = runStep();
assertThat( output.getInteger( "value_max" ), is( 10L ) );
assertThat( output.getInteger( "value_min" ), is( -2L ) );
assertThat( output.getInteger( "value_sum" ), is( 8L ) );
assertThat( output.getInteger( "value_ave" ), is( 2L ) );
assertThat( output.getInteger( "value_count" ), is( 4L ) );
assertThat( output.getInteger( "value_count_any" ), is( 10L ) );
assertThat( output.getInteger( "value_count_distinct" ), is( 3L ) );
}
private RowMetaAndData runStep() throws KettleException {
// Allocate meta
List<String> aggKeys = ImmutableList.copyOf( aggregates.keySet() );
MemoryGroupByMeta meta = new MemoryGroupByMeta();
meta.allocate( 0, rowMeta.size() * aggKeys.size() );
for ( int i = 0; i < rowMeta.size(); i++ ) {
String name = rowMeta.getValueMeta( i ).getName();
for ( int j = 0; j < aggKeys.size(); j++ ) {
String aggKey = aggKeys.get( j );
int index = i * aggKeys.size() + j;
meta.getAggregateField()[index] = name + "_" + aggKey;
meta.getSubjectField()[index] = name;
meta.getAggregateType()[index] = aggregates.get( aggKey );
}
}
MemoryGroupByData data = new MemoryGroupByData();
data.map = Maps.newHashMap();
// Add to trans
TransMeta transMeta = mock( TransMeta.class );
StepMeta stepMeta = new StepMeta( STEP_NAME, meta );
when( transMeta.findStep( STEP_NAME ) ).thenReturn( stepMeta );
// Spy on step, regrettable but we need to easily inject rows
MemoryGroupBy step = spy( new MemoryGroupBy( stepMeta, data, 0, transMeta, mock( Trans.class ) ) );
step.copyVariablesFrom( variables );
doNothing().when( step ).putRow( (RowMetaInterface) any(), (Object[]) any() );
doNothing().when( step ).setOutputDone();
// Process rows
doReturn( rowMeta ).when( step ).getInputRowMeta();
for ( Object[] row : getRows() ) {
doReturn( row ).when( step ).getRow();
assertThat( step.processRow( meta, data ), is( true ) );
}
verify( step, never() ).putRow( (RowMetaInterface) any(), (Object[]) any() );
// Mark stop
doReturn( null ).when( step ).getRow();
assertThat( step.processRow( meta, data ), is( false ) );
verify( step ).setOutputDone();
// Collect output
ArgumentCaptor<RowMetaInterface> rowMetaCaptor = ArgumentCaptor.forClass( RowMetaInterface.class );
ArgumentCaptor<Object[]> rowCaptor = ArgumentCaptor.forClass( Object[].class );
verify( step ).putRow( rowMetaCaptor.capture(), rowCaptor.capture() );
return new RowMetaAndData( rowMetaCaptor.getValue(), rowCaptor.getValue() );
}
private void addColumn( ValueMetaInterface meta, Object... values ) {
int column = rowMeta.size();
rowMeta.addValueMeta( meta );
for ( int row = 0; row < values.length; row++ ) {
data.put( row, column, Optional.fromNullable( values[row] ) );
}
}
private Iterable<Object[]> getRows() {
if ( data.isEmpty() ) {
return ImmutableSet.of();
}
Range<Integer> rows = Range.closed( 0, data.rowMap().lastKey() );
return FluentIterable.from( ContiguousSet.create( rows, DiscreteDomain.integers() ) )
.transform( Functions.forMap( data.rowMap(), ImmutableMap.<Integer, Optional<Object>>of() ) )
.transform( new Function<Map<Integer, Optional<Object>>, Object[]>() {
@Override public Object[] apply( Map<Integer, Optional<Object>> input ) {
Object[] row = new Object[rowMeta.size()];
for ( Map.Entry<Integer, Optional<Object>> entry : input.entrySet() ) {
row[entry.getKey()] = entry.getValue().orNull();
}
return row;
}
} );
}
}