/*
 * Copyright © 2014-2015 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.internal.app.runtime.distributed;

import co.cask.cdap.api.flow.FlowSpecification;
import co.cask.cdap.api.flow.FlowletConnection;
import co.cask.cdap.app.program.Program;
import co.cask.cdap.common.queue.QueueName;
import co.cask.cdap.data2.transaction.queue.QueueAdmin;
import co.cask.cdap.data2.transaction.stream.StreamAdmin;
import co.cask.cdap.internal.app.runtime.flow.FlowUtils;
import co.cask.tephra.TransactionExecutorFactory;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import org.apache.twill.api.ResourceReport;
import org.apache.twill.api.TwillController;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Collection;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import javax.annotation.concurrent.NotThreadSafe;

/**
 * For updating number of flowlet instances.
 *
 * <p>An update suspends the target flowlet and its upstream flowlets, reconfigures the
 * consumer queues of the target flowlet, changes the instance count through the
 * {@link TwillController}, and finally resumes the suspended flowlets.
 */
@NotThreadSafe
final class DistributedFlowletInstanceUpdater {

  private static final Logger LOG = LoggerFactory.getLogger(DistributedFlowletInstanceUpdater.class);

  /** Upper bound, in seconds, on how long to wait for a flowlet to reach its expected instance count. */
  private static final int MAX_WAIT_SECONDS = 30;
  /** Sleep duration, in seconds, between two consecutive checks of the instance count. */
  private static final int SECONDS_PER_WAIT = 1;

  private final Program program;
  private final TwillController twillController;
  private final QueueAdmin queueAdmin;
  private final StreamAdmin streamAdmin;
  /** Queues looked up by flowlet id when reconfiguring the consumers of that flowlet. */
  private final Multimap<String, QueueName> consumerQueues;
  private final TransactionExecutorFactory txExecutorFactory;

  /**
   * Creates an updater bound to a running flow.
   *
   * @param program the program used to generate consumer group ids
   * @param twillController controller used to send commands and change instances
   * @param queueAdmin passed through to {@link FlowUtils#reconfigure}
   * @param streamAdmin passed through to {@link FlowUtils#reconfigure}
   * @param consumerQueues queues looked up by flowlet id
   * @param txExecutorFactory passed through to {@link FlowUtils#reconfigure}
   */
  DistributedFlowletInstanceUpdater(Program program, TwillController twillController,
                                    QueueAdmin queueAdmin, StreamAdmin streamAdmin,
                                    Multimap<String, QueueName> consumerQueues,
                                    TransactionExecutorFactory txExecutorFactory) {
    this.program = program;
    this.twillController = twillController;
    this.queueAdmin = queueAdmin;
    this.streamAdmin = streamAdmin;
    this.consumerQueues = consumerQueues;
    this.txExecutorFactory = txExecutorFactory;
  }

  /**
   * Changes the number of instances of the given flowlet.
   *
   * <p>Note that if any step fails, the exception propagates to the caller and flowlets that
   * have already been suspended are not resumed by this method.
   *
   * @param flowletId name of the flowlet whose instance count is changing
   * @param newInstanceCount the instance count to change to
   * @param flowSpec specification of the flow; used to find upstream flowlets and the
   *                 currently expected instance count of each flowlet
   * @throws Exception if waiting for instances times out or is interrupted, or if any
   *                   suspend, reconfigure, change-instances or resume step fails
   */
  void update(String flowletId, int newInstanceCount, FlowSpecification flowSpec) throws Exception {
    // Find all flowlets that are source of the given flowletId.
    Set<String> flowlets = getUpstreamFlowlets(flowSpec, flowletId, Sets.<String>newHashSet());
    flowlets.add(flowletId);

    // Suspend all upstream flowlets and the flowlet that is going to change instances
    for (String id : flowlets) {
      waitForInstances(id, getInstances(flowSpec, id));
      // Need to suspend one by one due to a bug in Twill (TWILL-123)
      twillController.sendCommand(id, ProgramCommands.SUSPEND).get();
    }

    FlowUtils.reconfigure(consumerQueues.get(flowletId),
                          FlowUtils.generateConsumerGroupId(program, flowletId), newInstanceCount,
                          streamAdmin, queueAdmin, txExecutorFactory);

    twillController.changeInstances(flowletId, newInstanceCount).get();

    for (String id : flowlets) {
      twillController.sendCommand(id, ProgramCommands.RESUME).get();
    }
  }

  /**
   * Waits until there are {@code expectedInstances} of the flowlet. This is needed to prevent
   * the case where a suspend command is sent before all flowlet instances have been registered
   * in ZK, and then the change instance command is sent after the new flowlet instances have
   * started up, which will cause them to crash because it cannot change instances without being
   * in the suspended state.
   *
   * <p>The instance count is checked immediately and then once every {@link #SECONDS_PER_WAIT}
   * seconds. The timeout decision is always made against a freshly read count, so the method
   * never fails when the expected count was reached during the last sleep.
   *
   * @param flowletId name of the flowlet to wait for
   * @param expectedInstances the number of instances that must be provisioned
   * @throws InterruptedException if the thread is interrupted while sleeping between checks
   * @throws TimeoutException if the expected count is still not reached after
   *                          {@link #MAX_WAIT_SECONDS} seconds of waiting
   */
  private void waitForInstances(String flowletId, int expectedInstances)
    throws InterruptedException, TimeoutException {
    int numRunningFlowlets = getNumberOfProvisionedInstances(flowletId);
    int secondsWaited = 0;
    while (numRunningFlowlets != expectedInstances) {
      // Check the deadline before sleeping, using the count that was just read.
      if (secondsWaited >= MAX_WAIT_SECONDS) {
        String errmsg =
          String.format("waited %d seconds for instances of %s to reach expected count of %d, but %d are running",
                        secondsWaited, flowletId, expectedInstances, numRunningFlowlets);
        LOG.error(errmsg);
        throw new TimeoutException(errmsg);
      }
      LOG.debug("waiting for {} instances of {} before suspending flowlets", expectedInstances, flowletId);
      TimeUnit.SECONDS.sleep(SECONDS_PER_WAIT);
      secondsWaited += SECONDS_PER_WAIT;
      numRunningFlowlets = getNumberOfProvisionedInstances(flowletId);
    }
  }

  /**
   * Returns the number of instances of the given flowlet listed in the current resource report.
   *
   * @param flowletId name of the flowlet
   * @return the number of entries reported for the flowlet, or {@code 0} if no resource report
   *         is available yet
   */
  private int getNumberOfProvisionedInstances(String flowletId) {
    // The report can be null if requested before the application has completely started;
    // treat that as "nothing provisioned yet" so that the caller keeps waiting.
    ResourceReport report = twillController.getResourceReport();
    if (report == null) {
      return 0;
    }
    return report.getRunnableResources(flowletId).size();
  }

  /**
   * Collects the names of all flowlets that are directly connected as a source to the given flowlet.
   * Connections whose source type is not {@link FlowletConnection.Type#FLOWLET} are ignored.
   *
   * @param flowSpec specification of the flow containing the connections
   * @param flowletId name of the target flowlet
   * @param result collection to add the source flowlet names to
   * @param <T> type of the result collection
   * @return the same {@code result} collection that was passed in
   */
  private <T extends Collection<String>> T getUpstreamFlowlets(FlowSpecification flowSpec,
                                                               String flowletId, T result) {
    for (FlowletConnection connection : flowSpec.getConnections()) {
      if (connection.getTargetName().equals(flowletId)
        && connection.getSourceType() == FlowletConnection.Type.FLOWLET) {
        result.add(connection.getSourceName());
      }
    }
    return result;
  }

  /**
   * Returns the instance count recorded in the flow specification for the given flowlet.
   *
   * @param flowSpec specification of the flow
   * @param flowletId name of the flowlet; must be present in {@code flowSpec.getFlowlets()}
   * @return the number of instances according to the specification
   */
  private int getInstances(FlowSpecification flowSpec, String flowletId) {
    return flowSpec.getFlowlets().get(flowletId).getInstances();
  }
}