/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.optimizer.costs;

import static org.junit.Assert.assertTrue;

import org.apache.flink.optimizer.dag.EstimateProvider;
import org.junit.Test;

/**
* Tests for the cost formulas in the {@link DefaultCostEstimator}. Most of the tests verify
* relative relationships between the costs of different strategies rather than absolute values.
*/
public class DefaultCostEstimatorTest {
// estimates
private static final long SMALL_DATA_SIZE = 10000;
private static final long SMALL_RECORD_COUNT = 100;
private static final long MEDIUM_DATA_SIZE = 500000000L;
private static final long MEDIUM_RECORD_COUNT = 500000L;
private static final long BIG_DATA_SIZE = 100000000000L;
private static final long BIG_RECORD_COUNT = 100000000L;
private static final EstimateProvider UNKNOWN_ESTIMATES = new UnknownEstimates();
private static final EstimateProvider ZERO_ESTIMATES = new Estimates(0, 0);
private static final EstimateProvider SMALL_ESTIMATES = new Estimates(SMALL_DATA_SIZE, SMALL_RECORD_COUNT);
private static final EstimateProvider MEDIUM_ESTIMATES = new Estimates(MEDIUM_DATA_SIZE, MEDIUM_RECORD_COUNT);
private static final EstimateProvider BIG_ESTIMATES = new Estimates(BIG_DATA_SIZE, BIG_RECORD_COUNT);
private final CostEstimator costEstimator = new DefaultCostEstimator();
// --------------------------------------------------------------------------------------------
@Test
public void testShipStrategiesIsolated() {
testShipStrategiesIsolated(UNKNOWN_ESTIMATES, 1);
testShipStrategiesIsolated(UNKNOWN_ESTIMATES, 10);
testShipStrategiesIsolated(ZERO_ESTIMATES, 1);
testShipStrategiesIsolated(ZERO_ESTIMATES, 10);
testShipStrategiesIsolated(SMALL_ESTIMATES, 1);
testShipStrategiesIsolated(SMALL_ESTIMATES, 10);
testShipStrategiesIsolated(BIG_ESTIMATES, 1);
testShipStrategiesIsolated(BIG_ESTIMATES, 10);
}
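/**
 * Computes the cost of each ship strategy in isolation for the given estimates and target
 * parallelism, and checks the expected relative ordering: random repartitioning is at most as
 * expensive as hash partitioning, which is cheaper than range partitioning, and broadcasting is
 * the most expensive option when the target parallelism is greater than one.
 */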
private void testShipStrategiesIsolated(EstimateProvider estimates, int targetParallelism) {
Costs random = new Costs();
costEstimator.addRandomPartitioningCost(estimates, random);
Costs hash = new Costs();
costEstimator.addHashPartitioningCost(estimates, hash);
Costs range = new Costs();
costEstimator.addRangePartitionCost(estimates, range);
Costs broadcast = new Costs();
costEstimator.addBroadcastCost(estimates, targetParallelism, broadcast);
int randomVsHash = random.compareTo(hash);
int hashVsRange = hash.compareTo(range);
int hashVsBroadcast = hash.compareTo(broadcast);
int rangeVsBroadcast = range.compareTo(broadcast);
// random repartitioning is at most as expensive as hash partitioning
assertTrue(randomVsHash <= 0);
// range partitioning is always more expensive than hash partitioning
assertTrue(hashVsRange < 0);
// broadcasting is always more expensive than hash partitioning
if (targetParallelism > 1) {
assertTrue(hashVsBroadcast < 0);
} else {
assertTrue(hashVsBroadcast <= 0);
}
// range partitioning is cheaper than broadcasting when the target parallelism is greater than one
if (targetParallelism > 1) {
assertTrue(rangeVsBroadcast < 0);
}
}
// --------------------------------------------------------------------------------------------
@Test
public void testShipStrategyCombinationsPlain() {
Costs hashBothSmall = new Costs();
Costs hashSmallAndLarge = new Costs();
Costs hashBothLarge = new Costs();
Costs hashSmallBcLarge10 = new Costs();
Costs hashLargeBcSmall10 = new Costs();
Costs hashSmallBcLarge1000 = new Costs();
Costs hashLargeBcSmall1000 = new Costs();
Costs forwardSmallBcLarge10 = new Costs();
Costs forwardLargeBcSmall10 = new Costs();
Costs forwardSmallBcLarge1000 = new Costs();
Costs forwardLargeBcSmall1000 = new Costs();
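// hash partition both inputs: medium/medium, medium/large, and large/large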
costEstimator.addHashPartitioningCost(MEDIUM_ESTIMATES, hashBothSmall);
costEstimator.addHashPartitioningCost(MEDIUM_ESTIMATES, hashBothSmall);
costEstimator.addHashPartitioningCost(MEDIUM_ESTIMATES, hashSmallAndLarge);
costEstimator.addHashPartitioningCost(BIG_ESTIMATES, hashSmallAndLarge);
costEstimator.addHashPartitioningCost(BIG_ESTIMATES, hashBothLarge);
costEstimator.addHashPartitioningCost(BIG_ESTIMATES, hashBothLarge);
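// hash partition one input and broadcast the other, with target parallelism 10 and 1000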
costEstimator.addHashPartitioningCost(MEDIUM_ESTIMATES, hashSmallBcLarge10);
costEstimator.addBroadcastCost(BIG_ESTIMATES, 10, hashSmallBcLarge10);
costEstimator.addHashPartitioningCost(BIG_ESTIMATES, hashLargeBcSmall10);
costEstimator.addBroadcastCost(MEDIUM_ESTIMATES, 10, hashLargeBcSmall10);
costEstimator.addHashPartitioningCost(MEDIUM_ESTIMATES, hashSmallBcLarge1000);
costEstimator.addBroadcastCost(BIG_ESTIMATES, 1000, hashSmallBcLarge1000);
costEstimator.addHashPartitioningCost(BIG_ESTIMATES, hashLargeBcSmall1000);
costEstimator.addBroadcastCost(MEDIUM_ESTIMATES, 1000, hashLargeBcSmall1000);
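// forward one input (adds no shipping cost) and broadcast the other, with target parallelism 10 and 1000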
costEstimator.addBroadcastCost(BIG_ESTIMATES, 10, forwardSmallBcLarge10);
costEstimator.addBroadcastCost(MEDIUM_ESTIMATES, 10, forwardLargeBcSmall10);
costEstimator.addBroadcastCost(BIG_ESTIMATES, 1000, forwardSmallBcLarge1000);
costEstimator.addBroadcastCost(MEDIUM_ESTIMATES, 1000, forwardLargeBcSmall1000);
// hash partitioning cost grows monotonically with the input size
assertTrue(hashBothSmall.compareTo(hashSmallAndLarge) < 0);
assertTrue(hashSmallAndLarge.compareTo(hashBothLarge) < 0);
// broadcasting the smaller input is better than broadcasting the larger one
assertTrue(hashLargeBcSmall10.compareTo(hashSmallBcLarge10) < 0);
assertTrue(forwardLargeBcSmall10.compareTo(forwardSmallBcLarge10) < 0);
assertTrue(hashLargeBcSmall1000.compareTo(hashSmallBcLarge1000) < 0);
assertTrue(forwardLargeBcSmall1000.compareTo(forwardSmallBcLarge1000) < 0);
// broadcasting the small input and forwarding the large one beats partitioning both, given the size difference
assertTrue(forwardLargeBcSmall10.compareTo(hashSmallAndLarge) < 0);
// broadcasting to too many targets becomes expensive again
assertTrue(forwardLargeBcSmall1000.compareTo(hashSmallAndLarge) > 0);
// the replication factor (target parallelism) is respected
assertTrue(hashSmallBcLarge10.compareTo(hashSmallBcLarge1000) < 0);
assertTrue(hashLargeBcSmall10.compareTo(hashLargeBcSmall1000) < 0);
assertTrue(forwardSmallBcLarge10.compareTo(forwardSmallBcLarge1000) < 0);
assertTrue(forwardLargeBcSmall10.compareTo(forwardLargeBcSmall1000) < 0);
// forwarding one side is cheaper than hash partitioning it
assertTrue(forwardSmallBcLarge10.compareTo(hashSmallBcLarge10) < 0);
assertTrue(forwardSmallBcLarge1000.compareTo(hashSmallBcLarge1000) < 0);
assertTrue(forwardLargeBcSmall10.compareTo(hashLargeBcSmall10) < 0);
assertTrue(forwardLargeBcSmall1000.compareTo(hashLargeBcSmall1000) < 0);
}
// --------------------------------------------------------------------------------------------
@Test
public void testShipStrategyCombinationsWithUnknowns() {
testShipStrategyCombinationsWithUnknowns(UNKNOWN_ESTIMATES);
testShipStrategyCombinationsWithUnknowns(ZERO_ESTIMATES);
testShipStrategyCombinationsWithUnknowns(SMALL_ESTIMATES);
testShipStrategyCombinationsWithUnknowns(MEDIUM_ESTIMATES);
testShipStrategyCombinationsWithUnknowns(BIG_ESTIMATES);
}
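/**
 * Combines a known estimate with an unknown one and checks that hash partitioning both inputs is
 * preferred over broadcasting, that there is no bias between broadcasting a known-size and an
 * unknown-size input, and that the replication factor still matters.
 */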
private void testShipStrategyCombinationsWithUnknowns(EstimateProvider knownEstimates) {
Costs hashBoth = new Costs();
Costs bcKnown10 = new Costs();
Costs bcUnknown10 = new Costs();
Costs bcKnown1000 = new Costs();
Costs bcUnknown1000 = new Costs();
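// hash partition both inputs, and broadcast either the known or the unknown input with replication factors 10 and 1000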
costEstimator.addHashPartitioningCost(knownEstimates, hashBoth);
costEstimator.addHashPartitioningCost(UNKNOWN_ESTIMATES, hashBoth);
costEstimator.addBroadcastCost(knownEstimates, 10, bcKnown10);
costEstimator.addBroadcastCost(UNKNOWN_ESTIMATES, 10, bcUnknown10);
costEstimator.addBroadcastCost(knownEstimates, 1000, bcKnown1000);
costEstimator.addBroadcastCost(UNKNOWN_ESTIMATES, 1000, bcUnknown1000);
// hash partitioning both inputs should be cheaper than broadcasting either of them
assertTrue(hashBoth.compareTo(bcKnown10) < 0);
assertTrue(hashBoth.compareTo(bcUnknown10) < 0);
assertTrue(hashBoth.compareTo(bcKnown1000) < 0);
assertTrue(hashBoth.compareTo(bcUnknown1000) < 0);
// there should be no bias in broadcasting a known or unknown size input
assertTrue(bcKnown10.compareTo(bcUnknown10) == 0);
assertTrue(bcKnown1000.compareTo(bcUnknown1000) == 0);
// replication factor does matter
assertTrue(bcKnown10.compareTo(bcKnown1000) < 0);
assertTrue(bcUnknown10.compareTo(bcUnknown1000) < 0);
}
// --------------------------------------------------------------------------------------------
@Test
public void testJoinCostFormulasPlain() {
// hash join costs
Costs hashBothSmall = new Costs();
Costs hashBothLarge = new Costs();
Costs hashSmallBuild = new Costs();
Costs hashLargeBuild = new Costs();
costEstimator.addHybridHashCosts(SMALL_ESTIMATES, BIG_ESTIMATES, hashSmallBuild, 1);
costEstimator.addHybridHashCosts(BIG_ESTIMATES, SMALL_ESTIMATES, hashLargeBuild, 1);
costEstimator.addHybridHashCosts(SMALL_ESTIMATES, SMALL_ESTIMATES, hashBothSmall, 1);
costEstimator.addHybridHashCosts(BIG_ESTIMATES, BIG_ESTIMATES, hashBothLarge, 1);
assertTrue(hashBothSmall.compareTo(hashSmallBuild) < 0);
assertTrue(hashSmallBuild.compareTo(hashLargeBuild) < 0);
assertTrue(hashLargeBuild.compareTo(hashBothLarge) < 0);
// merge join costs
Costs mergeBothSmall = new Costs();
Costs mergeBothLarge = new Costs();
Costs mergeSmallFirst = new Costs();
Costs mergeSmallSecond = new Costs();
costEstimator.addLocalSortCost(SMALL_ESTIMATES, mergeSmallFirst);
costEstimator.addLocalSortCost(BIG_ESTIMATES, mergeSmallFirst);
costEstimator.addLocalMergeCost(SMALL_ESTIMATES, BIG_ESTIMATES, mergeSmallFirst, 1);
costEstimator.addLocalSortCost(BIG_ESTIMATES, mergeSmallSecond);
costEstimator.addLocalSortCost(SMALL_ESTIMATES, mergeSmallSecond);
costEstimator.addLocalMergeCost(BIG_ESTIMATES, SMALL_ESTIMATES, mergeSmallSecond, 1);
costEstimator.addLocalSortCost(SMALL_ESTIMATES, mergeBothSmall);
costEstimator.addLocalSortCost(SMALL_ESTIMATES, mergeBothSmall);
costEstimator.addLocalMergeCost(SMALL_ESTIMATES, SMALL_ESTIMATES, mergeBothSmall, 1);
costEstimator.addLocalSortCost(BIG_ESTIMATES, mergeBothLarge);
costEstimator.addLocalSortCost(BIG_ESTIMATES, mergeBothLarge);
costEstimator.addLocalMergeCost(BIG_ESTIMATES, BIG_ESTIMATES, mergeBothLarge, 1);
assertTrue(mergeBothSmall.compareTo(mergeSmallFirst) < 0);
assertTrue(mergeBothSmall.compareTo(mergeSmallSecond) < 0);
assertTrue(mergeSmallFirst.compareTo(mergeSmallSecond) == 0);
assertTrue(mergeSmallFirst.compareTo(mergeBothLarge) < 0);
assertTrue(mergeSmallSecond.compareTo(mergeBothLarge) < 0);
// compare merge join and hash join costs
assertTrue(hashBothSmall.compareTo(mergeBothSmall) < 0);
assertTrue(hashBothLarge.compareTo(mergeBothLarge) < 0);
assertTrue(hashSmallBuild.compareTo(mergeSmallFirst) < 0);
assertTrue(hashSmallBuild.compareTo(mergeSmallSecond) < 0);
assertTrue(hashLargeBuild.compareTo(mergeSmallFirst) < 0);
assertTrue(hashLargeBuild.compareTo(mergeSmallSecond) < 0);
}
// --------------------------------------------------------------------------------------------
@Test
public void testJoinCostFormulasWithWeights() {
testJoinCostFormulasWithWeights(UNKNOWN_ESTIMATES, SMALL_ESTIMATES);
testJoinCostFormulasWithWeights(SMALL_ESTIMATES, UNKNOWN_ESTIMATES);
testJoinCostFormulasWithWeights(UNKNOWN_ESTIMATES, MEDIUM_ESTIMATES);
testJoinCostFormulasWithWeights(MEDIUM_ESTIMATES, UNKNOWN_ESTIMATES);
testJoinCostFormulasWithWeights(BIG_ESTIMATES, MEDIUM_ESTIMATES);
testJoinCostFormulasWithWeights(MEDIUM_ESTIMATES, BIG_ESTIMATES);
}
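/**
 * Computes hybrid hash join costs (with either input as the build side) and sort-merge join costs
 * for the given estimates with cost weights 1 and 5, and checks that a higher weight leads to
 * higher costs and that the hash-based strategies stay cheaper than the sort-merge strategy.
 */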
private void testJoinCostFormulasWithWeights(EstimateProvider e1, EstimateProvider e2) {
Costs hf1 = new Costs();
Costs hf5 = new Costs();
Costs hs1 = new Costs();
Costs hs5 = new Costs();
Costs mm1 = new Costs();
Costs mm5 = new Costs();
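// hybrid hash join with either input as the build side, and sort-merge join, each with cost weights 1 and 5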
costEstimator.addHybridHashCosts(e1, e2, hf1, 1);
costEstimator.addHybridHashCosts(e1, e2, hf5, 5);
costEstimator.addHybridHashCosts(e2, e1, hs1, 1);
costEstimator.addHybridHashCosts(e2, e1, hs5, 5);
costEstimator.addLocalSortCost(e1, mm1);
costEstimator.addLocalSortCost(e2, mm1);
costEstimator.addLocalMergeCost(e1, e2, mm1, 1);
costEstimator.addLocalSortCost(e1, mm5);
costEstimator.addLocalSortCost(e2, mm5);
mm5.multiplyWith(5);
costEstimator.addLocalMergeCost(e1, e2, mm5, 5);
// weight 1 versus weight 5
assertTrue(hf1.compareTo(hf5) < 0);
assertTrue(hs1.compareTo(hs5) < 0);
assertTrue(mm1.compareTo(mm5) < 0);
// hash versus merge
assertTrue(hf1.compareTo(mm1) < 0);
assertTrue(hs1.compareTo(mm1) < 0);
assertTrue(hf5.compareTo(mm5) < 0);
assertTrue(hs5.compareTo(mm5) < 0);
}
// --------------------------------------------------------------------------------------------
@Test
public void testHashJoinCostFormulasWithCaches() {
Costs hashBothUnknown10 = new Costs();
Costs hashBothUnknownCached10 = new Costs();
Costs hashBothSmall10 = new Costs();
Costs hashBothSmallCached10 = new Costs();
Costs hashSmallLarge10 = new Costs();
Costs hashSmallLargeCached10 = new Costs();
Costs hashLargeSmall10 = new Costs();
Costs hashLargeSmallCached10 = new Costs();
Costs hashLargeSmall1 = new Costs();
Costs hashLargeSmallCached1 = new Costs();
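// plain versus cached hybrid hash join for unknown, medium, and medium/large inputs, with cost weights 10 and 1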
costEstimator.addHybridHashCosts(UNKNOWN_ESTIMATES, UNKNOWN_ESTIMATES, hashBothUnknown10, 10);
costEstimator.addCachedHybridHashCosts(UNKNOWN_ESTIMATES, UNKNOWN_ESTIMATES, hashBothUnknownCached10, 10);
costEstimator.addHybridHashCosts(MEDIUM_ESTIMATES, MEDIUM_ESTIMATES, hashBothSmall10, 10);
costEstimator.addCachedHybridHashCosts(MEDIUM_ESTIMATES, MEDIUM_ESTIMATES, hashBothSmallCached10, 10);
costEstimator.addHybridHashCosts(MEDIUM_ESTIMATES, BIG_ESTIMATES, hashSmallLarge10, 10);
costEstimator.addCachedHybridHashCosts(MEDIUM_ESTIMATES, BIG_ESTIMATES, hashSmallLargeCached10, 10);
costEstimator.addHybridHashCosts(BIG_ESTIMATES, MEDIUM_ESTIMATES, hashLargeSmall10, 10);
costEstimator.addCachedHybridHashCosts(BIG_ESTIMATES, MEDIUM_ESTIMATES, hashLargeSmallCached10, 10);
costEstimator.addHybridHashCosts(BIG_ESTIMATES, MEDIUM_ESTIMATES, hashLargeSmall1, 1);
costEstimator.addCachedHybridHashCosts(BIG_ESTIMATES, MEDIUM_ESTIMATES, hashLargeSmallCached1, 1);
// cached variant is always cheaper
assertTrue(hashBothUnknown10.compareTo(hashBothUnknownCached10) > 0);
assertTrue(hashBothSmall10.compareTo(hashBothSmallCached10) > 0);
assertTrue(hashSmallLarge10.compareTo(hashSmallLargeCached10) > 0);
assertTrue(hashLargeSmall10.compareTo(hashLargeSmallCached10) > 0);
// caching the large side is better, because then the small side is the one that incurs the additional I/O
assertTrue(hashLargeSmallCached10.compareTo(hashSmallLargeCached10) < 0);
// with a cost weight of one, the cached variant costs the same as the non-cached one
assertTrue(hashLargeSmall1.compareTo(hashLargeSmallCached1) == 0);
}
// --------------------------------------------------------------------------------------------
// Estimate providers
// --------------------------------------------------------------------------------------------
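/** An estimate provider that reports all estimates as unknown (-1). */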
private static final class UnknownEstimates implements EstimateProvider {
@Override
public long getEstimatedOutputSize() { return -1; }
@Override
public long getEstimatedNumRecords() { return -1; }
@Override
public float getEstimatedAvgWidthPerOutputRecord() { return -1.0f; }
}
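/** An estimate provider with fixed data size, record count, and optional average record width. */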
private static final class Estimates implements EstimateProvider {
private final long size;
private final long records;
private final float width;
public Estimates(long size, long records) {
this(size, records, -1.0f);
}
public Estimates(long size, long records, float width) {
this.size = size;
this.records = records;
this.width = width;
}
@Override
public long getEstimatedOutputSize() {
return this.size;
}
@Override
public long getEstimatedNumRecords() {
return this.records;
}
@Override
public float getEstimatedAvgWidthPerOutputRecord() {
return this.width;
}
}
}