package com.hubspot.blazar.externalservice; import static com.hubspot.blazar.externalservice.BuildClusterService.BuildContainerInfo.BuildContainerState.NOT_STARTED; import static com.hubspot.blazar.externalservice.BuildClusterService.BuildContainerInfo.BuildContainerState.UNKNOWN; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.atomic.AtomicInteger; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.base.MoreObjects; import com.google.common.base.Optional; import com.google.common.collect.ImmutableList; import com.google.inject.Inject; import com.google.inject.Singleton; import com.hubspot.blazar.base.GitInfo; import com.hubspot.blazar.base.LogChunk; import com.hubspot.blazar.base.ModuleBuild; import com.hubspot.blazar.config.BlazarConfiguration; import com.hubspot.blazar.config.SingularityClusterConfiguration; import com.hubspot.blazar.data.service.BranchService; import com.hubspot.blazar.data.service.ModuleBuildService; import com.hubspot.blazar.data.service.ModuleService; import com.hubspot.blazar.exception.BuildClusterException; import com.hubspot.blazar.exception.LogNotFoundException; import com.hubspot.blazar.exception.NonRetryableBuildException; import com.hubspot.blazar.externalservice.BuildClusterService.BuildContainerInfo.BuildContainerState; import com.hubspot.blazar.util.TimeUtils; import com.hubspot.horizon.AsyncHttpClient; import com.hubspot.horizon.HttpRequest; import com.hubspot.horizon.HttpResponse; import com.hubspot.mesos.json.MesosFileChunkObject; import com.hubspot.singularity.ExtendedTaskState; import com.hubspot.singularity.SingularityS3Log; import com.hubspot.singularity.SingularitySandbox; import com.hubspot.singularity.SingularitySandboxFile; import com.hubspot.singularity.SingularityTaskCleanupResult; import com.hubspot.singularity.SingularityTaskHistory; import com.hubspot.singularity.SingularityTaskHistoryUpdate; import com.hubspot.singularity.SingularityTaskHistoryUpdate.SimplifiedTaskState; import com.hubspot.singularity.SingularityTaskIdHistory; import com.hubspot.singularity.api.SingularityKillTaskRequest; import com.hubspot.singularity.api.SingularityRunNowRequest; import com.hubspot.singularity.client.SingularityClient; @Singleton public class BuildClusterService { private static final Logger LOG = LoggerFactory.getLogger(BuildClusterService.class); private static final String DEFAULT_LOG_FILE_NAME = "service.log"; private final Map<String, SingularityClient> singularityClusterClients; private final BlazarConfiguration blazarConfiguration; private final ModuleService moduleService; protected final ModuleBuildService moduleBuildService; private final BranchService branchService; private final BuildClusterHealthChecker buildClusterHealthChecker; private final List<String> availableClusters; private final AtomicInteger nextClusterIndex; private final SingularityKillTaskRequest singularityKillTaskRequest; private final AsyncHttpClient asyncHttpClient; @Inject public BuildClusterService(Map<String, SingularityClient> singularityClusterClients, BlazarConfiguration blazarConfiguration, ModuleService moduleService, ModuleBuildService moduleBuildService, BranchService branchService, BuildClusterHealthChecker buildClusterHealthChecker, AsyncHttpClient asyncHttpClient) { this.singularityClusterClients = singularityClusterClients; this.blazarConfiguration = blazarConfiguration; this.moduleService = moduleService; this.moduleBuildService = moduleBuildService; this.buildClusterHealthChecker = buildClusterHealthChecker; this.branchService = branchService; this.asyncHttpClient = asyncHttpClient; this.singularityKillTaskRequest = new SingularityKillTaskRequest( Optional.of(true), Optional.of("The associated Blazar build has been cancelled"), Optional.absent(), Optional.absent(), Optional.absent()); availableClusters = ImmutableList.<String>builder().addAll(singularityClusterClients.keySet()).build(); nextClusterIndex = new AtomicInteger(0); } /** * It starts the docker container that is going to do the build * This doesn't initiate the build itself. The build executor should periodically call back to blazar and check * the module build state. Building can start when the state is LAUNCHING. At the point that we launch the build * container the state of the build is either 'QUEUED' or 'WAITING_FOR_UPSTREAM_BUILD' * @param moduleBuild * the module build that the launched container should execute * @throws NonRetryableBuildException */ public synchronized String launchBuildContainer(ModuleBuild moduleBuild) throws BuildClusterException { Optional<String> clusterToUseOptional = pickClusterToLaunchBuild(moduleBuild, new HashSet<>()); if (!clusterToUseOptional.isPresent()) { String message = String.format("Could not find a cluster to launch module build %d", moduleBuild.getId().get()); LOG.warn(message); throw new BuildClusterException(message); } final String clusterToUse = clusterToUseOptional.get(); SingularityClient singularityClient = singularityClusterClients.get(clusterToUse); SingularityClusterConfiguration singularityClusterConfiguration = blazarConfiguration.getSingularityClusterConfigurations().get(clusterToUse); try { singularityClient.runSingularityRequest(singularityClusterConfiguration.getRequest(), Optional.of(buildRequest(moduleBuild))); LOG.info("Run Blazar executor in Build cluster '{}' for module build {}", clusterToUse, moduleBuild.getId().get()); } catch (Exception e) { String message = String.format("Failed to start build container in cluster %s for module build %d", clusterToUse, moduleBuild.getId().get()); LOG.error(message, e); throw new BuildClusterException(message, e); } return clusterToUse; } public BuildContainerInfo getBuildContainerInfo(ModuleBuild moduleBuild) throws BuildClusterException { if (!moduleBuild.getBuildClusterName().isPresent()) { throw new BuildClusterException(String.format("The 'buildClusterName' is missing in module build %d. Cannot get the state of the associated build container", moduleBuild.getId().get())); } String buildClusterName = moduleBuild.getBuildClusterName().get(); if (isMesosCluster(buildClusterName)) { return getMesosContainerInfo(moduleBuild); } else { throw new BuildClusterException(String.format("Could not find build cluster name: '%s' among the configured build clusters", buildClusterName)); } } public void killBuildContainer(ModuleBuild moduleBuild) throws BuildClusterException { if (!moduleBuild.getBuildClusterName().isPresent()) { throw new BuildClusterException(String.format("The 'buildClusterName' is missing in module build %d. Cannot find and kill the associated build container", moduleBuild.getId().get())); } String buildClusterName = moduleBuild.getBuildClusterName().get(); if (isMesosCluster(buildClusterName)) { killMesosContainer(moduleBuild); } else { throw new BuildClusterException(String.format("Could not find build cluster name: '%s' among the configured build clusters", buildClusterName)); } } public LogChunk getBuildContainerLog(ModuleBuild moduleBuild, long byteOffset, long byteLength) throws Exception { if (!moduleBuild.getBuildClusterName().isPresent()) { throw new BuildClusterException(String.format("The 'buildClusterName' is missing in module build %d. Cannot get the log of the associated build container", moduleBuild.getId().get())); } String buildClusterName = moduleBuild.getBuildClusterName().get(); if (isMesosCluster(buildClusterName)) { return getMesosContainerLog(moduleBuild, byteOffset, byteLength); } else { throw new BuildClusterException(String.format("Could not find build cluster name: '%s' among the configured build clusters", buildClusterName)); } } public long getBuildContainerLogSize(ModuleBuild moduleBuild) throws BuildClusterException, LogNotFoundException { if (!moduleBuild.getBuildClusterName().isPresent()) { throw new BuildClusterException(String.format("The 'buildClusterName' is missing in module build %d. Cannot get the log size of the associated build container", moduleBuild.getId().get())); } String buildClusterName = moduleBuild.getBuildClusterName().get(); if (isMesosCluster(buildClusterName)) { return getMesosContainerLogSize(moduleBuild); } else { throw new BuildClusterException(String.format("Could not find build cluster name: '%s' among the configured build clusters. Cannot get the log size of the build container associated with module build %d", buildClusterName, moduleBuild.getId().get())); } } public String getBuildContainerLogUrl(ModuleBuild moduleBuild) throws BuildClusterException, LogNotFoundException { if (!moduleBuild.getBuildClusterName().isPresent()) { throw new BuildClusterException(String.format("The 'buildClusterName' is missing in module build %d. Cannot get the log url of the associated build container", moduleBuild.getId().get())); } String buildClusterName = moduleBuild.getBuildClusterName().get(); if (isMesosCluster(buildClusterName)) { return getMesosContainerLogUrl(moduleBuild); } else { throw new BuildClusterException(String.format("Could not find build cluster name: '%s' among the configured build clusters. Cannot get the log size of the build container associated with module build %d", buildClusterName, moduleBuild.getId().get())); } } private String getMesosContainerLogUrl(ModuleBuild moduleBuild) throws BuildClusterException, LogNotFoundException { if (!moduleBuild.getTaskId().isPresent()) { throw new LogNotFoundException(String.format("Cannot get container log for module build %s. Could not find the associated singularity task id.", moduleBuild.getId().get())); } String singularityTaskId = moduleBuild.getTaskId().get(); String buildClusterName = moduleBuild.getBuildClusterName().get(); SingularityClient singularityClient = singularityClusterClients.get(buildClusterName); Optional<SingularitySandbox> sandboxOptional = singularityClient.browseTaskSandBox(singularityTaskId, singularityTaskId); java.util.Optional<SingularitySandboxFile> buildLogFile = java.util.Optional.empty(); if (sandboxOptional.isPresent()) { buildLogFile = sandboxOptional.get().getFiles().stream().filter(logFile -> DEFAULT_LOG_FILE_NAME.equals(logFile.getName())).findFirst(); } final String buildLogUrl; if (buildLogFile.isPresent()) { String host = sandboxOptional.get().getSlaveHostname(); int port = blazarConfiguration.getSingularityClusterConfigurations().get(buildClusterName).getSlaveHttpPort(); String path = sandboxOptional.get().getFullPathToRoot() + "/" + sandboxOptional.get().getCurrentDirectory() + "/" + buildLogFile.get().getName(); buildLogUrl = String.format("http://%s:%d/files/download.json?path=%s", host, port, path); } else { SingularityS3Log urlData = findS3ServiceLog(singularityTaskId, singularityClient); buildLogUrl = urlData.getDownloadUrl(); } return buildLogUrl; } private long getMesosContainerLogSize(ModuleBuild moduleBuild) throws LogNotFoundException { if (!moduleBuild.getTaskId().isPresent()) { throw new LogNotFoundException(String.format("Cannot get container log for module build %s. Could not find the associated singularity task id.", moduleBuild.getId().get())); } String singularityTaskId = moduleBuild.getTaskId().get(); String buildClusterName = moduleBuild.getBuildClusterName().get(); SingularityClient singularityClient = singularityClusterClients.get(buildClusterName); String path = singularityTaskId + "/" + DEFAULT_LOG_FILE_NAME; Optional<MesosFileChunkObject> completeLogFileChunk = singularityClient.readSandBoxFile(singularityTaskId, path, Optional.absent(), Optional.absent(), Optional.absent()); long logFileSize; if (completeLogFileChunk.isPresent()) { logFileSize = completeLogFileChunk.get().getOffset(); } else { logFileSize = findS3ServiceLog(singularityTaskId, singularityClient).getSize(); } return logFileSize; } private LogChunk getMesosContainerLog(ModuleBuild moduleBuild, long byteOffset, long byteLength) throws BuildClusterException, LogNotFoundException { if (!moduleBuild.getTaskId().isPresent()) { throw new LogNotFoundException(String.format("Cannot get container log for module build %s. Could not find the associated singularity task id.", moduleBuild.getId().get())); } String singularityTaskId = moduleBuild.getTaskId().get(); String buildClusterName = moduleBuild.getBuildClusterName().get(); SingularityClient singularityClient = singularityClusterClients.get(buildClusterName); String path = singularityTaskId + "/" + DEFAULT_LOG_FILE_NAME; Optional<String> grep = Optional.absent(); Optional<MesosFileChunkObject> chunk = singularityClient.readSandBoxFile(singularityTaskId, path, grep, Optional.of(byteOffset), Optional.of(byteLength)); if (chunk.isPresent()) { if (chunk.get().getData().isEmpty() && logCompleted(moduleBuild, singularityClient)) { return new LogChunk(chunk.get().getData(), chunk.get().getOffset(), -1); } else { return new LogChunk(chunk.get().getData(), chunk.get().getOffset()); } } else { SingularityS3Log s3Log = findS3ServiceLog(singularityTaskId, singularityClient); if (byteOffset >= s3Log.getSize()) { return new LogChunk("", s3Log.getSize(), -1); } return readS3LogChunk(s3Log.getGetUrl(), byteOffset, byteLength); } } private SingularityS3Log findS3ServiceLog(String singularityTaskId, SingularityClient singularityClient) throws LogNotFoundException { Collection<SingularityS3Log> s3Logs = singularityClient.getTaskLogs(singularityTaskId); List<SingularityS3Log> serviceLogs = new ArrayList<>(); for (SingularityS3Log s3Log : s3Logs) { if (s3Log.getGetUrl().contains(DEFAULT_LOG_FILE_NAME)) { serviceLogs.add(s3Log); } } if (serviceLogs.isEmpty()) { throw new LogNotFoundException("No S3 log found for singularity task id" + singularityTaskId); } else if (serviceLogs.size() > 1) { throw new LogNotFoundException("Multiple S3 logs found for singularity task id" + singularityTaskId); } else { return serviceLogs.get(0); } } private boolean logCompleted(ModuleBuild moduleBuild, SingularityClient singularityClient) { return moduleBuild.getState().isComplete() && mesosContainerFinishedOrNeverLaunched(moduleBuild.getTaskId().get(), singularityClient); } private boolean mesosContainerFinishedOrNeverLaunched(String singularityTaskId, SingularityClient singularityClient) { // never launched Optional<SingularityTaskHistory> taskHistory = singularityClient.getHistoryForTask(singularityTaskId); if (!taskHistory.isPresent()) { return true; } // finished SimplifiedTaskState taskState = SingularityTaskHistoryUpdate.getCurrentState(taskHistory.get().getTaskUpdates()); return taskState == SimplifiedTaskState.DONE; } private LogChunk readS3LogChunk(String url, long offset, long length) throws BuildClusterException { HttpRequest request = HttpRequest.newBuilder() .setUrl(url) .addHeader("Range", String.format("bytes=%d-%d", offset, offset + length - 1)) .build(); HttpResponse response; try { response = asyncHttpClient.execute(request).get(); } catch (Exception e) { throw new BuildClusterException(String.format("An error occured while retrieving container log from S3. The error is: %s", e.getMessage()), e); } if (response.isSuccess()) { return new LogChunk(response.getAsBytes(), offset); } else { String message = String.format("Error reading S3 log, status code %d, response %s", response.getStatusCode(), response.getAsString()); throw new BuildClusterException(message); } } private void killMesosContainer(ModuleBuild moduleBuild) throws BuildClusterException { if (!moduleBuild.getTaskId().isPresent()) { throw new BuildClusterException(String.format("Cannot kill container for module build %s. Could not find the associated singularity task id.", moduleBuild.getId().get())); } long moduleBuildId = moduleBuild.getId().get(); try { String singularityTaskId = moduleBuild.getTaskId().get(); String buildClusterName = moduleBuild.getBuildClusterName().get(); SingularityClient singularityClient = singularityClusterClients.get(buildClusterName); Optional<SingularityTaskCleanupResult> result = singularityClient.killTask(singularityTaskId, Optional.of(singularityKillTaskRequest)); if (!result.isPresent()) { LOG.info("Tried to kill singularity task for module build {} but the task was not found", moduleBuildId); } else { LOG.info("Request to kill singularity task for module build {} was successfully sent. The result is {}", moduleBuildId, result.get().toString()); } } catch (Exception e) { LOG.error("The request to kill singularity task for module build {} failed. The error is: {}", moduleBuildId, e.getMessage(), e); throw new BuildClusterException(String.format("The request to kill singularity task for module build %d failed. The error is: %s", moduleBuildId, e.getMessage()), e); } } private BuildContainerInfo getMesosContainerInfo(ModuleBuild moduleBuild) { String buildClusterName = moduleBuild.getBuildClusterName().get(); String singularityRequestId = blazarConfiguration.getSingularityClusterConfigurations().get(buildClusterName).getRequest(); String runId = String.valueOf(moduleBuild.getId().get()); SingularityClient singularityClient = singularityClusterClients.get(buildClusterName); Optional<SingularityTaskIdHistory> task = singularityClient.getHistoryForTask(singularityRequestId, runId); if (!task.isPresent()) { return new BuildContainerInfo(NOT_STARTED, Optional.absent(), TimeUtils.nowInUtcMillis()); } long taskStateUpdatedAtMillis = task.get().getUpdatedAt(); Optional<ExtendedTaskState> taskState = task.get().getLastTaskState(); if (!taskState.isPresent()) { return new BuildContainerInfo(UNKNOWN, Optional.of(task.get().getTaskId().getId()), taskStateUpdatedAtMillis); } if (taskState.get().isDone()) { return new BuildContainerInfo(BuildContainerState.FINISHED, Optional.of(task.get().getTaskId().getId()), taskStateUpdatedAtMillis); } return new BuildContainerInfo(BuildContainerState.RUNNING, Optional.of(task.get().getTaskId().getId()), taskStateUpdatedAtMillis); } private boolean isMesosCluster(String buildClusterName) { return blazarConfiguration.getSingularityClusterConfigurations().keySet().contains(buildClusterName); } private Optional<String> pickClusterToLaunchBuild(ModuleBuild moduleBuild, Set<String> examinedClusters) throws BuildClusterException { if (examinedClusters.containsAll(availableClusters)) { return Optional.absent(); } long moduleBuildId = moduleBuild.getId().get(); String clusterToUse = getAndSetNextClusterToUse(); examinedClusters.add(clusterToUse); SingularityClusterConfiguration singularityClusterConfiguration = blazarConfiguration.getSingularityClusterConfigurations().get(clusterToUse); switch (singularityClusterConfiguration.getBuildStrategy()) { case ALWAYS: LOG.debug("Strategy for Build Cluster '{}' is ALWAYS. Will check if this cluster is healthy for executing module build {}", clusterToUse, moduleBuildId); return checkAvailabilityAndPersistCluster(clusterToUse, moduleBuild, examinedClusters); case WHITELIST: if (singularityClusterConfiguration.getRepositories().isEmpty()) { LOG.warn("You have selected the 'WHITELIST' build strategy for build cluster {} but you have not provided any repositories. " + "So the cluster is considered available for any repo build. Will check if it is healthy to execute module build {}", clusterToUse, moduleBuildId); return checkAvailabilityAndPersistCluster(clusterToUse, moduleBuild, examinedClusters); } Optional<String> moduleRepository = getModuleRepository(moduleBuild.getModuleId()); if (!moduleRepository.isPresent()) { throw new BuildClusterException(String.format("Could not get the repository for module %d", moduleBuild.getModuleId())); } boolean moduleRepositoryIsWhitelisted = singularityClusterConfiguration.getRepositories().stream() .anyMatch(whitelistedRepo -> whitelistedRepo.toLowerCase().equals(moduleRepository.get().toLowerCase())); if (moduleRepositoryIsWhitelisted) { LOG.debug("Strategy for Build Cluster '{}' is WHITELIST and repository {} is in the whitelist. Will check if the cluster is healthy for module build {}", clusterToUse, moduleRepository.get(), moduleBuildId); return checkAvailabilityAndPersistCluster(clusterToUse, moduleBuild, examinedClusters); } LOG.debug("Strategy for Build Cluster '{}' is WHITELIST but repository {} is NOT in the whitelist. Will try to pick another cluster for module build {}", clusterToUse, moduleRepository.get(), moduleBuildId); return pickClusterToLaunchBuild(moduleBuild, examinedClusters); case EXCLUSIVE_WHITELIST: case EMERGENCY: case EMERGENCY_AND_WHITELIST: case EMERGENCY_AND_EXCLUSIVE_WHITELIST: default: throw new BuildClusterException(String.format("Strategy %s is not yet implemented", singularityClusterConfiguration.getBuildStrategy())); } } private SingularityRunNowRequest buildRequest(ModuleBuild moduleBuild) { String buildId = Long.toString(moduleBuild.getId().get()); Optional<com.hubspot.mesos.Resources> buildResources = Optional.absent(); if (moduleBuild.getResolvedConfig().isPresent() && moduleBuild.getResolvedConfig().get().getBuildResources().isPresent()) { buildResources = Optional.of(moduleBuild.getResolvedConfig().get().getBuildResources().get().toMesosResources()); } return new SingularityRunNowRequest( Optional.of("Running Blazar module build " + buildId), Optional.of(false), Optional.of(buildId), Optional.of(Arrays.asList("--buildId", buildId)), buildResources); } private Optional<String> getModuleRepository(int moduleId) { int branchId = moduleService.getBranchIdFromModuleId(moduleId); Optional<GitInfo> moduleBranchMaybe = branchService.get(branchId); if (moduleBranchMaybe.isPresent()) { GitInfo moduleBranch = moduleBranchMaybe.get(); return Optional.of(String.format("%s-%s-%s", moduleBranch.getHost(), moduleBranch.getOrganization(), moduleBranch.getRepository())); } return Optional.absent(); } private Optional<String> checkAvailabilityAndPersistCluster(String clusterToUse, ModuleBuild moduleBuild, Set<String> examinedClusters) throws BuildClusterException { long moduleBuildId = moduleBuild.getId().get(); if (buildClusterHealthChecker.isClusterAvailable(clusterToUse)) { LOG.debug("Build cluster {} is healthy. Will be used to execute module build {}", clusterToUse, moduleBuildId); moduleBuildService.updateBuildClusterName(moduleBuildId, clusterToUse); return Optional.of(clusterToUse); } else { LOG.warn("Build cluster {} is not healthy for executing module build {}. Will look for another cluster", clusterToUse, moduleBuildId); return pickClusterToLaunchBuild(moduleBuild, examinedClusters); } } private String getAndSetNextClusterToUse() { int clusterIndex; try { clusterIndex = nextClusterIndex.incrementAndGet(); } catch (Exception e) { // in the almost improbable case we have reached the max int we reset the counter; clusterIndex = nextClusterIndex.getAndSet(0); } return availableClusters.get(clusterIndex % availableClusters.size()); } public static class BuildContainerInfo { public enum BuildContainerState { FINISHED, // The container has run and is not running any more, it could have been lost, killed, failed, succeeded RUNNING, // Container has started and build is still running NOT_STARTED, // Container has not yet started UNKNOWN // We got back incomplete information about the container and the state couldn't be determined } BuildContainerState state; /** * An identifier for the container that runs the build */ Optional<String>containerId; /** * Last time the state was updated */ long updatedAtMillis; public BuildContainerInfo(BuildContainerState state, Optional<String> containerId, long updatedAtMillis) { this.state = state; this.containerId = MoreObjects.firstNonNull(containerId, Optional.absent()); this.updatedAtMillis = updatedAtMillis; } public BuildContainerState getState() { return state; } public Optional<String> getContainerId() { return containerId; } public long getUpdatedAtMillis() { return updatedAtMillis; } } }