/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.optimizer.costs;

import static org.junit.Assert.assertTrue;

import org.apache.flink.optimizer.dag.EstimateProvider;
import org.junit.Test;

/**
 * Tests for the cost formulas in the {@link DefaultCostEstimator}. Most of the tests
 * establish relative relationships between strategies rather than checking absolute values.
 */
public class DefaultCostEstimatorTest {
	
	// estimates
	
	private static final long SMALL_DATA_SIZE = 10000;
	private static final long SMALL_RECORD_COUNT = 100;
	
	private static final long MEDIUM_DATA_SIZE = 500000000L;
	private static final long MEDIUM_RECORD_COUNT = 500000L;
	
	private static final long BIG_DATA_SIZE = 100000000000L;
	private static final long BIG_RECORD_COUNT = 100000000L;
	
	private static final EstimateProvider UNKNOWN_ESTIMATES = new UnknownEstimates();
	private static final EstimateProvider ZERO_ESTIMATES = new Estimates(0, 0);
	private static final EstimateProvider SMALL_ESTIMATES = new Estimates(SMALL_DATA_SIZE, SMALL_RECORD_COUNT);
	private static final EstimateProvider MEDIUM_ESTIMATES = new Estimates(MEDIUM_DATA_SIZE, MEDIUM_RECORD_COUNT);
	private static final EstimateProvider BIG_ESTIMATES = new Estimates(BIG_DATA_SIZE, BIG_RECORD_COUNT);
	
	private final CostEstimator costEstimator = new DefaultCostEstimator();
	
	// --------------------------------------------------------------------------------------------
	
	@Test
	public void testShipStrategiesIsolated() {
		testShipStrategiesIsolated(UNKNOWN_ESTIMATES, 1);
		testShipStrategiesIsolated(UNKNOWN_ESTIMATES, 10);
		testShipStrategiesIsolated(ZERO_ESTIMATES, 1);
		testShipStrategiesIsolated(ZERO_ESTIMATES, 10);
		testShipStrategiesIsolated(SMALL_ESTIMATES, 1);
		testShipStrategiesIsolated(SMALL_ESTIMATES, 10);
		testShipStrategiesIsolated(BIG_ESTIMATES, 1);
		testShipStrategiesIsolated(BIG_ESTIMATES, 10);
	}
	
	private void testShipStrategiesIsolated(EstimateProvider estimates, int targetParallelism) {
		Costs random = new Costs();
		costEstimator.addRandomPartitioningCost(estimates, random);
		
		Costs hash = new Costs();
		costEstimator.addHashPartitioningCost(estimates, hash);
		
		Costs range = new Costs();
		costEstimator.addRangePartitionCost(estimates, range);
		
		Costs broadcast = new Costs();
		costEstimator.addBroadcastCost(estimates, targetParallelism, broadcast);
		
		int randomVsHash = random.compareTo(hash);
		int hashVsRange = hash.compareTo(range);
		int hashVsBroadcast = hash.compareTo(broadcast);
		int rangeVsBroadcast = range.compareTo(broadcast);
		
		// random repartitioning is at most as expensive as hash partitioning
		assertTrue(randomVsHash <= 0);
		
		// range partitioning is always more expensive than hash partitioning
		assertTrue(hashVsRange < 0);
		
		// broadcasting is always more expensive than hash partitioning
		if (targetParallelism > 1) {
			assertTrue(hashVsBroadcast < 0);
		} else {
			assertTrue(hashVsBroadcast <= 0);
		}
		
		// range partitioning is cheaper than broadcasting (checked only for parallelism > 1)
		if (targetParallelism > 1) {
			assertTrue(rangeVsBroadcast < 0);
		}
	}
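	
	// Taken together, the assertions above pin down the expected ordering of the isolated
	// ship strategies for targetParallelism > 1 (a summary of the checks, not an extra guarantee):
	//
	//   random <= hash < range < broadcast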
	
	// --------------------------------------------------------------------------------------------
	
	@Test
	public void testShipStrategyCombinationsPlain() {
		Costs hashBothSmall = new Costs();
		Costs hashSmallAndLarge = new Costs();
		Costs hashBothLarge = new Costs();
		
		Costs hashSmallBcLarge10 = new Costs();
		Costs hashLargeBcSmall10 = new Costs();
		Costs hashSmallBcLarge1000 = new Costs();
		Costs hashLargeBcSmall1000 = new Costs();
		
		Costs forwardSmallBcLarge10 = new Costs();
		Costs forwardLargeBcSmall10 = new Costs();
		Costs forwardSmallBcLarge1000 = new Costs();
		Costs forwardLargeBcSmall1000 = new Costs();
		
		costEstimator.addHashPartitioningCost(MEDIUM_ESTIMATES, hashBothSmall);
		costEstimator.addHashPartitioningCost(MEDIUM_ESTIMATES, hashBothSmall);
		costEstimator.addHashPartitioningCost(MEDIUM_ESTIMATES, hashSmallAndLarge);
		costEstimator.addHashPartitioningCost(BIG_ESTIMATES, hashSmallAndLarge);
		costEstimator.addHashPartitioningCost(BIG_ESTIMATES, hashBothLarge);
		costEstimator.addHashPartitioningCost(BIG_ESTIMATES, hashBothLarge);
		
		costEstimator.addHashPartitioningCost(MEDIUM_ESTIMATES, hashSmallBcLarge10);
		costEstimator.addBroadcastCost(BIG_ESTIMATES, 10, hashSmallBcLarge10);
		costEstimator.addHashPartitioningCost(BIG_ESTIMATES, hashLargeBcSmall10);
		costEstimator.addBroadcastCost(MEDIUM_ESTIMATES, 10, hashLargeBcSmall10);
		costEstimator.addHashPartitioningCost(MEDIUM_ESTIMATES, hashSmallBcLarge1000);
		costEstimator.addBroadcastCost(BIG_ESTIMATES, 1000, hashSmallBcLarge1000);
		costEstimator.addHashPartitioningCost(BIG_ESTIMATES, hashLargeBcSmall1000);
		costEstimator.addBroadcastCost(MEDIUM_ESTIMATES, 1000, hashLargeBcSmall1000);
		
		costEstimator.addBroadcastCost(BIG_ESTIMATES, 10, forwardSmallBcLarge10);
		costEstimator.addBroadcastCost(MEDIUM_ESTIMATES, 10, forwardLargeBcSmall10);
		costEstimator.addBroadcastCost(BIG_ESTIMATES, 1000, forwardSmallBcLarge1000);
		costEstimator.addBroadcastCost(MEDIUM_ESTIMATES, 1000, forwardLargeBcSmall1000);
		
		// hash cost is roughly monotonic in the input size
		assertTrue(hashBothSmall.compareTo(hashSmallAndLarge) < 0);
		assertTrue(hashSmallAndLarge.compareTo(hashBothLarge) < 0);
		
		// broadcasting the smaller side is better
		assertTrue(hashLargeBcSmall10.compareTo(hashSmallBcLarge10) < 0);
		assertTrue(forwardLargeBcSmall10.compareTo(forwardSmallBcLarge10) < 0);
		assertTrue(hashLargeBcSmall1000.compareTo(hashSmallBcLarge1000) < 0);
		assertTrue(forwardLargeBcSmall1000.compareTo(forwardSmallBcLarge1000) < 0);
		
		// broadcasting the small side and forwarding the large one is better than partitioning both, given the size difference
		assertTrue(forwardLargeBcSmall10.compareTo(hashSmallAndLarge) < 0);
		
		// broadcasting too far is expensive again
		assertTrue(forwardLargeBcSmall1000.compareTo(hashSmallAndLarge) > 0);
		
		// assert that the replication factor (target parallelism) is respected
		assertTrue(hashSmallBcLarge10.compareTo(hashSmallBcLarge1000) < 0);
		assertTrue(hashLargeBcSmall10.compareTo(hashLargeBcSmall1000) < 0);
		assertTrue(forwardSmallBcLarge10.compareTo(forwardSmallBcLarge1000) < 0);
		assertTrue(forwardLargeBcSmall10.compareTo(forwardLargeBcSmall1000) < 0);
		
		// forwarding one side is cheaper than hash partitioning it
		assertTrue(forwardSmallBcLarge10.compareTo(hashSmallBcLarge10) < 0);
		assertTrue(forwardSmallBcLarge1000.compareTo(hashSmallBcLarge1000) < 0);
		assertTrue(forwardLargeBcSmall10.compareTo(hashLargeBcSmall10) < 0);
		assertTrue(forwardLargeBcSmall1000.compareTo(hashLargeBcSmall1000) < 0);
	}
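	
	// Back-of-envelope arithmetic behind the two "broadcast distance" assertions above, assuming
	// network cost scales roughly with the number of bytes shipped (a simplification of the
	// actual cost formulas), using the constants defined at the top of this class:
	//
	//   partition both sides:       MEDIUM + BIG  ~  0.5 GB + 100 GB  ~  100.5 GB
	//   broadcast MEDIUM to 10:     MEDIUM * 10   ~  5 GB             (cheaper than 100.5 GB)
	//   broadcast MEDIUM to 1000:   MEDIUM * 1000 ~  500 GB           (more expensive again)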
	
	// --------------------------------------------------------------------------------------------
	
	@Test
	public void testShipStrategyCombinationsWithUnknowns() {
		testShipStrategyCombinationsWithUnknowns(UNKNOWN_ESTIMATES);
		testShipStrategyCombinationsWithUnknowns(ZERO_ESTIMATES);
		testShipStrategyCombinationsWithUnknowns(SMALL_ESTIMATES);
		testShipStrategyCombinationsWithUnknowns(MEDIUM_ESTIMATES);
		testShipStrategyCombinationsWithUnknowns(BIG_ESTIMATES);
	}
	
	private void testShipStrategyCombinationsWithUnknowns(EstimateProvider knownEstimates) {
		Costs hashBoth = new Costs();
		Costs bcKnown10 = new Costs();
		Costs bcUnknown10 = new Costs();
		Costs bcKnown1000 = new Costs();
		Costs bcUnknown1000 = new Costs();
		
		costEstimator.addHashPartitioningCost(knownEstimates, hashBoth);
		costEstimator.addHashPartitioningCost(UNKNOWN_ESTIMATES, hashBoth);
		
		costEstimator.addBroadcastCost(knownEstimates, 10, bcKnown10);
		costEstimator.addBroadcastCost(UNKNOWN_ESTIMATES, 10, bcUnknown10);
		costEstimator.addBroadcastCost(knownEstimates, 1000, bcKnown1000);
		costEstimator.addBroadcastCost(UNKNOWN_ESTIMATES, 1000, bcUnknown1000);
		
		// if we do not know one of the inputs, hashing both should be cheaper than anything else
		assertTrue(hashBoth.compareTo(bcKnown10) < 0);
		assertTrue(hashBoth.compareTo(bcUnknown10) < 0);
		assertTrue(hashBoth.compareTo(bcKnown1000) < 0);
		assertTrue(hashBoth.compareTo(bcUnknown1000) < 0);
		
		// there should be no bias between broadcasting an input of known and of unknown size
		assertTrue(bcKnown10.compareTo(bcUnknown10) == 0);
		assertTrue(bcKnown1000.compareTo(bcUnknown1000) == 0);
		
		// the replication factor does matter
		assertTrue(bcKnown10.compareTo(bcKnown1000) < 0);
		assertTrue(bcUnknown10.compareTo(bcUnknown1000) < 0);
	}
	
	// --------------------------------------------------------------------------------------------
	
	@Test
	public void testJoinCostFormulasPlain() {
		
		// hash join costs
		
		Costs hashBothSmall = new Costs();
		Costs hashBothLarge = new Costs();
		Costs hashSmallBuild = new Costs();
		Costs hashLargeBuild = new Costs();
		
		costEstimator.addHybridHashCosts(SMALL_ESTIMATES, BIG_ESTIMATES, hashSmallBuild, 1);
		costEstimator.addHybridHashCosts(BIG_ESTIMATES, SMALL_ESTIMATES, hashLargeBuild, 1);
		costEstimator.addHybridHashCosts(SMALL_ESTIMATES, SMALL_ESTIMATES, hashBothSmall, 1);
		costEstimator.addHybridHashCosts(BIG_ESTIMATES, BIG_ESTIMATES, hashBothLarge, 1);
		
		assertTrue(hashBothSmall.compareTo(hashSmallBuild) < 0);
		assertTrue(hashSmallBuild.compareTo(hashLargeBuild) < 0);
		assertTrue(hashLargeBuild.compareTo(hashBothLarge) < 0);
		
		// merge join costs
		
		Costs mergeBothSmall = new Costs();
		Costs mergeBothLarge = new Costs();
		Costs mergeSmallFirst = new Costs();
		Costs mergeSmallSecond = new Costs();
		
		costEstimator.addLocalSortCost(SMALL_ESTIMATES, mergeSmallFirst);
		costEstimator.addLocalSortCost(BIG_ESTIMATES, mergeSmallFirst);
		costEstimator.addLocalMergeCost(SMALL_ESTIMATES, BIG_ESTIMATES, mergeSmallFirst, 1);
		
		costEstimator.addLocalSortCost(BIG_ESTIMATES, mergeSmallSecond);
		costEstimator.addLocalSortCost(SMALL_ESTIMATES, mergeSmallSecond);
		costEstimator.addLocalMergeCost(BIG_ESTIMATES, SMALL_ESTIMATES, mergeSmallSecond, 1);
		
		costEstimator.addLocalSortCost(SMALL_ESTIMATES, mergeBothSmall);
		costEstimator.addLocalSortCost(SMALL_ESTIMATES, mergeBothSmall);
		costEstimator.addLocalMergeCost(SMALL_ESTIMATES, SMALL_ESTIMATES, mergeBothSmall, 1);
		
		costEstimator.addLocalSortCost(BIG_ESTIMATES, mergeBothLarge);
		costEstimator.addLocalSortCost(BIG_ESTIMATES, mergeBothLarge);
		costEstimator.addLocalMergeCost(BIG_ESTIMATES, BIG_ESTIMATES, mergeBothLarge, 1);
		
		assertTrue(mergeBothSmall.compareTo(mergeSmallFirst) < 0);
		assertTrue(mergeBothSmall.compareTo(mergeSmallSecond) < 0);
		assertTrue(mergeSmallFirst.compareTo(mergeSmallSecond) == 0);
		assertTrue(mergeSmallFirst.compareTo(mergeBothLarge) < 0);
		assertTrue(mergeSmallSecond.compareTo(mergeBothLarge) < 0);
		
		// compare merge join and hash join costs
		
		assertTrue(hashBothSmall.compareTo(mergeBothSmall) < 0);
		assertTrue(hashBothLarge.compareTo(mergeBothLarge) < 0);
		assertTrue(hashSmallBuild.compareTo(mergeSmallFirst) < 0);
		assertTrue(hashSmallBuild.compareTo(mergeSmallSecond) < 0);
		assertTrue(hashLargeBuild.compareTo(mergeSmallFirst) < 0);
		assertTrue(hashLargeBuild.compareTo(mergeSmallSecond) < 0);
	}
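	
	// Why the build side matters (informal, textbook hybrid-hash reasoning rather than the exact
	// DefaultCostEstimator formula): whatever part of the build side does not fit in memory is
	// spilled and re-read, so the amount of extra I/O is driven by the build side's size.
	// Building with the small input keeps that extra I/O low, which is what the assertion
	// hashSmallBuild < hashLargeBuild above reflects.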
	
	// --------------------------------------------------------------------------------------------
	
	@Test
	public void testJoinCostFormulasWithWeights() {
		testJoinCostFormulasWithWeights(UNKNOWN_ESTIMATES, SMALL_ESTIMATES);
		testJoinCostFormulasWithWeights(SMALL_ESTIMATES, UNKNOWN_ESTIMATES);
		testJoinCostFormulasWithWeights(UNKNOWN_ESTIMATES, MEDIUM_ESTIMATES);
		testJoinCostFormulasWithWeights(MEDIUM_ESTIMATES, UNKNOWN_ESTIMATES);
		testJoinCostFormulasWithWeights(BIG_ESTIMATES, MEDIUM_ESTIMATES);
		testJoinCostFormulasWithWeights(MEDIUM_ESTIMATES, BIG_ESTIMATES);
	}
	
	private void testJoinCostFormulasWithWeights(EstimateProvider e1, EstimateProvider e2) {
		Costs hf1 = new Costs();
		Costs hf5 = new Costs();
		Costs hs1 = new Costs();
		Costs hs5 = new Costs();
		Costs mm1 = new Costs();
		Costs mm5 = new Costs();
		
		costEstimator.addHybridHashCosts(e1, e2, hf1, 1);
		costEstimator.addHybridHashCosts(e1, e2, hf5, 5);
		costEstimator.addHybridHashCosts(e2, e1, hs1, 1);
		costEstimator.addHybridHashCosts(e2, e1, hs5, 5);
		
		costEstimator.addLocalSortCost(e1, mm1);
		costEstimator.addLocalSortCost(e2, mm1);
		costEstimator.addLocalMergeCost(e1, e2, mm1, 1);
		
		costEstimator.addLocalSortCost(e1, mm5);
		costEstimator.addLocalSortCost(e2, mm5);
		mm5.multiplyWith(5);
		costEstimator.addLocalMergeCost(e1, e2, mm5, 5);
		
		// weight 1 versus weight 5
		assertTrue(hf1.compareTo(hf5) < 0);
		assertTrue(hs1.compareTo(hs5) < 0);
		assertTrue(mm1.compareTo(mm5) < 0);
		
		// hash versus merge
		assertTrue(hf1.compareTo(mm1) < 0);
		assertTrue(hs1.compareTo(mm1) < 0);
		assertTrue(hf5.compareTo(mm5) < 0);
		assertTrue(hs5.compareTo(mm5) < 0);
	}
	
	// --------------------------------------------------------------------------------------------
	
	@Test
	public void testHashJoinCostFormulasWithCaches() {
		
		Costs hashBothUnknown10 = new Costs();
		Costs hashBothUnknownCached10 = new Costs();
		
		Costs hashBothSmall10 = new Costs();
		Costs hashBothSmallCached10 = new Costs();
		
		Costs hashSmallLarge10 = new Costs();
		Costs hashSmallLargeCached10 = new Costs();
		
		Costs hashLargeSmall10 = new Costs();
		Costs hashLargeSmallCached10 = new Costs();
		
		Costs hashLargeSmall1 = new Costs();
		Costs hashLargeSmallCached1 = new Costs();
		
		costEstimator.addHybridHashCosts(UNKNOWN_ESTIMATES, UNKNOWN_ESTIMATES, hashBothUnknown10, 10);
		costEstimator.addCachedHybridHashCosts(UNKNOWN_ESTIMATES, UNKNOWN_ESTIMATES, hashBothUnknownCached10, 10);
		
		costEstimator.addHybridHashCosts(MEDIUM_ESTIMATES, MEDIUM_ESTIMATES, hashBothSmall10, 10);
		costEstimator.addCachedHybridHashCosts(MEDIUM_ESTIMATES, MEDIUM_ESTIMATES, hashBothSmallCached10, 10);
		
		costEstimator.addHybridHashCosts(MEDIUM_ESTIMATES, BIG_ESTIMATES, hashSmallLarge10, 10);
		costEstimator.addCachedHybridHashCosts(MEDIUM_ESTIMATES, BIG_ESTIMATES, hashSmallLargeCached10, 10);
		
		costEstimator.addHybridHashCosts(BIG_ESTIMATES, MEDIUM_ESTIMATES, hashLargeSmall10, 10);
		costEstimator.addCachedHybridHashCosts(BIG_ESTIMATES, MEDIUM_ESTIMATES, hashLargeSmallCached10, 10);
		
		costEstimator.addHybridHashCosts(BIG_ESTIMATES, MEDIUM_ESTIMATES, hashLargeSmall1, 1);
		costEstimator.addCachedHybridHashCosts(BIG_ESTIMATES, MEDIUM_ESTIMATES, hashLargeSmallCached1, 1);
		
		// the cached variant is always cheaper
		assertTrue(hashBothUnknown10.compareTo(hashBothUnknownCached10) > 0);
		assertTrue(hashBothSmall10.compareTo(hashBothSmallCached10) > 0);
		assertTrue(hashSmallLarge10.compareTo(hashSmallLargeCached10) > 0);
		assertTrue(hashLargeSmall10.compareTo(hashLargeSmallCached10) > 0);
		
		// caching the large side is better, because the small side is then the one with the additional I/O
		assertTrue(hashLargeSmallCached10.compareTo(hashSmallLargeCached10) < 0);
		
		// with a weight of one, the cached variant costs the same as the non-cached variant
		assertTrue(hashLargeSmall1.compareTo(hashLargeSmallCached1) == 0);
	}
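	
	// A way to read the weight-1 equality above (an interpretation, not guaranteed by this test
	// alone): the weight models how often the join executes, e.g. inside an iteration. The cached
	// variant presumably pays the build cost once and only re-probes on repeated executions, so
	// with a single execution there is nothing to amortize and both variants cost the same.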
	
	// --------------------------------------------------------------------------------------------
	//  Estimate providers
	// --------------------------------------------------------------------------------------------
	
	/**
	 * An estimate provider that reports all estimates as unknown (-1).
	 */
	private static final class UnknownEstimates implements EstimateProvider {
		
		@Override
		public long getEstimatedOutputSize() { return -1; }
		
		@Override
		public long getEstimatedNumRecords() { return -1; }
		
		@Override
		public float getEstimatedAvgWidthPerOutputRecord() { return -1.0f; }
	}
	
	/**
	 * An estimate provider with a fixed output size and record count; the average record
	 * width is unknown (-1) unless given explicitly.
	 */
	private static final class Estimates implements EstimateProvider {
		
		private final long size;
		private final long records;
		private final float width;
		
		public Estimates(long size, long records) {
			this(size, records, -1.0f);
		}
		
		public Estimates(long size, long records, float width) {
			this.size = size;
			this.records = records;
			this.width = width;
		}
		
		@Override
		public long getEstimatedOutputSize() { return this.size; }
		
		@Override
		public long getEstimatedNumRecords() { return this.records; }
		
		@Override
		public float getEstimatedAvgWidthPerOutputRecord() { return this.width; }
	}
}