/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.drill.exec.physical.config; import java.util.List; import java.util.Map; import com.google.common.base.Preconditions; import com.google.common.collect.ArrayListMultimap; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import org.apache.drill.common.expression.LogicalExpression; import org.apache.drill.exec.physical.EndpointAffinity; import org.apache.drill.exec.physical.MinorFragmentEndpoint; import org.apache.drill.exec.physical.base.AbstractExchange; import org.apache.drill.exec.physical.base.PhysicalOperator; import org.apache.drill.exec.physical.base.Sender; import org.apache.drill.exec.planner.fragment.ParallelizationInfo; import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint; import com.fasterxml.jackson.annotation.JsonProperty; /** * DeMuxExchange is opposite of MuxExchange. It is used when the sender has overhead that is proportional to the * number of receivers. DeMuxExchange is run one instance per Drillbit endpoint which collects and distributes data * belonging to local receiving fragments running on the same Drillbit. * * Example: * On a 3 node cluster, if the sender has 10 receivers on each node each sender requires 30 buffers. By inserting * DeMuxExchange, we create one receiver per node which means total of 3 receivers for each sender. If the number of * senders is 10, we use 10*3 buffers instead of 10*30. DeMuxExchange has a overhead of buffer space that is equal to * number of local receivers. In this case each DeMuxExchange needs 10 buffers, so total of 3*10 buffers. */ public abstract class AbstractDeMuxExchange extends AbstractExchange { protected final LogicalExpression expr; // Ephemeral info used when creating execution fragments. protected Map<Integer, MinorFragmentEndpoint> receiverToSenderMapping; protected ArrayListMultimap<Integer, MinorFragmentEndpoint> senderToReceiversMapping; private boolean isSenderReceiverMappingCreated; public AbstractDeMuxExchange(@JsonProperty("child") PhysicalOperator child, @JsonProperty("expr") LogicalExpression expr) { super(child); this.expr = expr; } @JsonProperty("expr") public LogicalExpression getExpression(){ return expr; } @Override public ParallelizationInfo getSenderParallelizationInfo(List<DrillbitEndpoint> receiverFragmentEndpoints) { Preconditions.checkArgument(receiverFragmentEndpoints != null && receiverFragmentEndpoints.size() > 0, "Receiver fragment endpoint list should not be empty"); // We want to run one demux sender per Drillbit endpoint. // Identify the number of unique Drillbit endpoints in receiver fragment endpoints. List<DrillbitEndpoint> drillbitEndpoints = ImmutableSet.copyOf(receiverFragmentEndpoints).asList(); List<EndpointAffinity> affinities = Lists.newArrayList(); for(DrillbitEndpoint ep : drillbitEndpoints) { affinities.add(new EndpointAffinity(ep, Double.POSITIVE_INFINITY)); } return ParallelizationInfo.create(affinities.size(), affinities.size(), affinities); } @Override public ParallelizationInfo getReceiverParallelizationInfo(List<DrillbitEndpoint> senderFragmentEndpoints) { return ParallelizationInfo.UNLIMITED_WIDTH_NO_ENDPOINT_AFFINITY; } @Override public Sender getSender(int minorFragmentId, PhysicalOperator child) { createSenderReceiverMapping(); List<MinorFragmentEndpoint> receivers = senderToReceiversMapping.get(minorFragmentId); if (receivers == null || receivers.size() <= 0) { throw new IllegalStateException(String.format("Failed to find receivers for sender [%d]", minorFragmentId)); } return new HashPartitionSender(receiverMajorFragmentId, child, expr, receivers); } /** * In DeMuxExchange, sender fragment parallelization and endpoint assignment depends on receiver fragment endpoint * assignments. */ @Override public ParallelizationDependency getParallelizationDependency() { return ParallelizationDependency.SENDER_DEPENDS_ON_RECEIVER; } protected void createSenderReceiverMapping() { if (isSenderReceiverMappingCreated) { return; } senderToReceiversMapping = ArrayListMultimap.create(); receiverToSenderMapping = Maps.newHashMap(); // Find the list of receiver fragment ids assigned to each Drillbit endpoint ArrayListMultimap<DrillbitEndpoint, Integer> endpointReceiverList = ArrayListMultimap.create(); int receiverFragmentId = 0; for(DrillbitEndpoint receiverLocation : receiverLocations) { endpointReceiverList.put(receiverLocation, receiverFragmentId); receiverFragmentId++; } int senderFragmentId = 0; for(DrillbitEndpoint senderLocation : senderLocations) { final List<Integer> receiverMinorFragmentIds = endpointReceiverList.get(senderLocation); for(Integer receiverId : receiverMinorFragmentIds) { receiverToSenderMapping.put(receiverId, new MinorFragmentEndpoint(senderFragmentId, senderLocation)); senderToReceiversMapping.put(senderFragmentId, new MinorFragmentEndpoint(receiverId, receiverLocations.get(receiverId))); } senderFragmentId++; } isSenderReceiverMappingCreated = true; } }