/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.addthis.hydra.job.spawn.balancer;
import java.nio.charset.StandardCharsets;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import com.addthis.basis.net.HttpUtil;
import com.addthis.basis.util.Parameter;
import com.addthis.hydra.job.Job;
import com.addthis.hydra.job.JobTask;
import com.addthis.hydra.job.mq.HostState;
import com.addthis.hydra.job.spawn.HostManager;
import com.addthis.hydra.job.spawn.Spawn;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* A class in charge of estimating the full size of job tasks, since JobTask.getByteCount does not include backups.
* <p/>
* This class uses a simplifying assumption that all tasks within a single job have the same ratio between the actual
* byte count and the expected byte count, which is empirically true in nearly all cases. It is also assumed that
* this ratio will remain roughly constant on the time scale of hours.
*/
public class SpawnBalancerTaskSizer {

    private static final Logger log = LoggerFactory.getLogger(SpawnBalancerTaskSizer.class);

    /** Ratio applied when no measured ratio is cached for a job, or when a measurement is unusable. */
    private static final double defaultRatio = Double.parseDouble(Parameter.value("spawn.balancer.task.sizer.defaultratio", "1.5"));
    /** Upper clamp applied to a measured true/reported ratio. */
    private static final double maxRatio = Double.parseDouble(Parameter.value("spawn.balancer.task.sizer.maxratio", "10.0"));
    /** Lower clamp applied to a measured true/reported ratio. */
    private static final double minRatio = Double.parseDouble(
            Parameter.value("spawn.balancer.task.sizer.minratio", "1.0"));
    /** Initial delay and delay between queue consumptions, in milliseconds. */
    private static final long queueConsumptionInterval = Parameter.longValue("spawn.balancer.task.sizer.interval",
            60 * 1000);
    /** Number of hours a cached ratio is kept after it was written. */
    private static final int ratioExpirationHours = Parameter.intValue("spawn.balancer.task.sizer.ratio.expire", 12);
    /*
     * Static, so the flag is shared by every instance of this class: only the first startPolling call
     * in the JVM schedules a consumer.
     * NOTE(review): presumably this class is used as a singleton -- confirm before creating more than one instance.
     */
    private static final AtomicBoolean pollingStarted = new AtomicBoolean(false);

    private final Spawn spawn;
    private final HostManager hostManager;
    /** Pending size requests: job id -> task id to sample, in request order. Guarded by synchronizing on itself. */
    private final LinkedHashMap<String, Integer> queuedJobIds;
    /** Most recently computed true/reported ratio per job id. */
    private final Cache<String, Double> cachedJobRatios;

    public SpawnBalancerTaskSizer(Spawn spawn, HostManager hostManager) {
        queuedJobIds = new LinkedHashMap<>();
        cachedJobRatios = CacheBuilder.newBuilder().expireAfterWrite(ratioExpirationHours, TimeUnit.HOURS).build();
        this.spawn = spawn;
        this.hostManager = hostManager;
    }

    /**
     * Schedule the periodic queue consumer on the given executor. Only the first call has any effect;
     * subsequent calls return without scheduling anything.
     *
     * @param executor The executor that will run the queue consumer
     */
    public void startPolling(ScheduledExecutorService executor) {
        if (pollingStarted.compareAndSet(false, true)) {
            executor.scheduleWithFixedDelay(new QueueConsumer(), queueConsumptionInterval, queueConsumptionInterval, TimeUnit.MILLISECONDS);
        }
    }

    /**
     * Estimate the true size of a task, using the cached ratio if it is available.
     * This method performs no remote calls itself, so SpawnBalancer can continue working immediately.
     * If no ratio is cached for the task's job, the default ratio is used and a fetch of a more accurate
     * value is queued to be performed at some point in the future.
     *
     * @param task The task to estimate the size for
     * @return An estimated number of bytes for the true size of the task; 0 if task is null
     */
    public long estimateTrueSize(JobTask task) {
        if (task == null) {
            return 0L;
        }
        long taskReportedSize = getReportedSize(task);
        Double cachedRatio = cachedJobRatios.getIfPresent(task.getJobUUID());
        if (cachedRatio == null) {
            requestJobSizeFetch(task.getJobUUID(), task.getTaskID());
            return (long) (defaultRatio * taskReportedSize);
        }
        return (long) (cachedRatio * taskReportedSize);
    }

    /**
     * Return task.getByteCount if it is sensible.
     * Sometimes tasks erroneously report 0 bytes used. Use the job average in this case.
     *
     * @param task The task to look up; may be null
     * @return The task's byte count if positive, otherwise the job's average task size when the job is known,
     *         otherwise the task's own (non-positive) byte count; 0 if task is null
     */
    private long getReportedSize(JobTask task) {
        if (task == null) {
            return 0;
        }
        long byteCount = task.getByteCount();
        if (byteCount > 0) {
            return byteCount;
        }
        Job job = spawn.getJob(task.getJobUUID());
        if (job != null) {
            return job.calcAverageTaskSizeBytes();
        }
        return byteCount;
    }

    /**
     * Queue a request to measure the true/reported ratio for a job, using the given task as the sample.
     * Does nothing if a ratio is already cached for the job or a request for the job is already queued.
     *
     * @param jobId  The job to measure
     * @param taskId The task # within the job whose size should be fetched
     */
    public void requestJobSizeFetch(String jobId, int taskId) {
        synchronized (queuedJobIds) {
            if (cachedJobRatios.getIfPresent(jobId) != null || queuedJobIds.containsKey(jobId)) {
                return;
            }
            queuedJobIds.put(jobId, taskId);
        }
    }

    /**
     * Fetch the true size of a task from the minion that currently hosts it
     *
     * @param jobId  The job id of the task
     * @param taskId The task # within the job
     * @return The number of bytes parsed from the minion's response; -1 if the task or its host is unknown,
     *         or if the request or the parsing of its response fails
     */
    private long fetchTaskTrueSize(String jobId, int taskId) {
        JobTask task = spawn.getTask(jobId, taskId);
        if (task == null) {
            return -1;
        }
        HostState liveHost = hostManager.getHostState(task.getHostUUID());
        if (liveHost == null) {
            return -1;
        }
        String url = "http://" + liveHost.getHost() + ":" + liveHost.getPort() + "/task.size?id=" + jobId + "&node=" + taskId;
        try {
            // NOTE(review): the second argument is presumably a timeout; confirm that 0 does not mean "wait forever".
            byte[] result = HttpUtil.httpGet(url, 0).getBody();
            // Decode with an explicit charset and tolerate surrounding whitespace (e.g. a trailing newline).
            return Long.parseLong(new String(result, StandardCharsets.UTF_8).trim());
        } catch (Exception e) {
            // Keep the cause in the log so that connection and parse failures can be told apart.
            log.warn("Failed to fetch task size for " + task.getJobKey(), e);
        }
        return -1;
    }

    /**
     * Take the earliest job to be requested on the queue, fetch the ratio for that job, and update the cache.
     * If the true size cannot be fetched, the default ratio is what gets cached for the job.
     */
    private void consumeFromQueue() {
        String jobId;
        int taskId;
        synchronized (queuedJobIds) {
            if (queuedJobIds.isEmpty()) {
                return;
            }
            // LinkedHashMap iterates in insertion order, so the first entry is the oldest request.
            Map.Entry<String, Integer> oldest = queuedJobIds.entrySet().iterator().next();
            // Read the entry before removing it from the map.
            jobId = oldest.getKey();
            taskId = oldest.getValue();
            queuedJobIds.remove(jobId);
        }
        // The remote fetch happens outside the lock so that requestJobSizeFetch is never blocked on I/O.
        long trueSize = fetchTaskTrueSize(jobId, taskId);
        long reportedSize = getReportedSize(spawn.getTask(jobId, taskId));
        double ratio = getRatio(trueSize, reportedSize);
        log.info("[spawn.balancer.task.sizer] updated ratio for job " + jobId + " reported=" + reportedSize + " true=" + trueSize + " ratio=" + ratio);
        cachedJobRatios.put(jobId, ratio);
    }

    /**
     * Get the ratio for the specified true/reported sizes, subject to some sanity restraints
     *
     * @param trueSize     The true size as reported by minion running du
     * @param reportedSize The result of JobTask.getByteCount
     * @return The default ratio if either size is not positive; otherwise trueSize / reportedSize
     *         clamped to the configured minimum and maximum ratios
     */
    private static double getRatio(long trueSize, long reportedSize) {
        if (trueSize <= 0 || reportedSize <= 0) {
            return defaultRatio;
        }
        double rawRatio = (double) trueSize / reportedSize;
        return Math.max(minRatio, Math.min(rawRatio, maxRatio));
    }

    /**
     * A runnable that will consume from the queue of jobs that SpawnBalancer wants to size up
     */
    private class QueueConsumer implements Runnable {

        @Override
        public void run() {
            try {
                consumeFromQueue();
            } catch (Exception e) {
                // Catch everything: an exception escaping run() would suppress later executions of this scheduled task.
                log.warn("Failed to consume from TaskSizer queue: " + e, e);
            }
        }
    }
}