/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.addthis.hydra.job;
import javax.annotation.Nullable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Set;
import com.addthis.basis.util.JitterClock;
import com.addthis.basis.util.Parameter;
import com.addthis.basis.util.RollingLog;
import com.addthis.codec.annotations.FieldConfig;
import com.addthis.codec.json.CodecJSON;
import com.addthis.hydra.minion.Minion;
import com.addthis.hydra.util.LogUtil;
import com.addthis.hydra.util.StringMapHelper;
import com.addthis.maljson.JSONObject;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import com.fasterxml.jackson.annotation.JsonAutoDetect;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* for job submission and tracking
* IJob that keeps everything in one Codable Object graph
*/
@JsonAutoDetect(getterVisibility = JsonAutoDetect.Visibility.NONE,
                isGetterVisibility = JsonAutoDetect.Visibility.NONE,
                setterVisibility = JsonAutoDetect.Visibility.NONE)
public final class Job implements IJob {

    private static final Logger log = LoggerFactory.getLogger(Job.class);

    /* Orders tasks by ascending task id */
    private static final Comparator<JobTask> taskNodeComparator = Comparator.comparingInt(JobTask::getTaskID);

    /* numeric job state; read back through JobState.makeState */
    @FieldConfig private int state;
    /* cached number of tasks currently in an active state */
    @FieldConfig private int countActiveTasks;
    /* creator of the job */
    @FieldConfig private String creator;
    /* owner of the job */
    @FieldConfig private String owner;
    /* group of the job */
    @FieldConfig private String group;
    /* can the owner modify the job */
    @FieldConfig private boolean ownerWritable;
    /* can the group modify the job */
    @FieldConfig private boolean groupWritable;
    /* can the world modify the job */
    @FieldConfig private boolean worldWritable;
    /* can the owner start/stop the job */
    @FieldConfig private boolean ownerExecutable;
    /* can the group start/stop the job */
    @FieldConfig private boolean groupExecutable;
    /* can the world start/stop the job */
    @FieldConfig private boolean worldExecutable;
    /* user who last modified the job */
    @FieldConfig private String lastModifiedBy;
    /* last modification time */
    @FieldConfig private long lastModifiedAt;
    /* purely ornamental description of this job */
    @FieldConfig private String description;
    /* key used for storing / retrieving this job */
    @FieldConfig private String id;
    /* higher means more important */
    @FieldConfig private int priority;
    /* Unix epoch offset of time job was created */
    @FieldConfig private Long createTime;
    /* Unix epoch offset of time job was last submitted */
    @FieldConfig private Long submitTime;
    /* Unix epoch offset of time first job node was assigned */
    @FieldConfig private Long startTime;
    /* Unix epoch offset of time last job node completed */
    @FieldConfig private Long endTime;
    /* minutes between re-kicking */
    @FieldConfig private Long rekickTimeout;
    /* minutes max time to allocate to job before it's interrupted */
    @FieldConfig private Long maxRunTime;
    /* list of nodes and their state */
    @FieldConfig private ArrayList<JobTask> nodes;
    /* JSON configuration url -- only read at submit time if conf empty */
    @FieldConfig private String config;
    /* URL for spawn to call on job complete. for automating workflows */
    @FieldConfig private String onComplete;
    @FieldConfig private String onError;
    /* timeout in seconds */
    @FieldConfig private int onCompleteTimeout;
    @FieldConfig private int onErrorTimeout;
    @FieldConfig private int runCount;
    @FieldConfig private long runTime;
    @FieldConfig private String command;
    @FieldConfig private boolean disabled;
    @FieldConfig private ArrayList<JobParameter> parameters;
    @FieldConfig private int hourlyBackups;
    @FieldConfig private int dailyBackups;
    @FieldConfig private int weeklyBackups;
    @FieldConfig private int monthlyBackups;
    @FieldConfig private int replicas;
    @FieldConfig private int readOnlyReplicas;
    @FieldConfig private boolean dontAutoBalanceMe;
    @FieldConfig private boolean dontDeleteMe;
    @FieldConfig private boolean dontCloneMe;
    @FieldConfig private boolean wasStopped;
    @FieldConfig private int maxSimulRunning;
    @FieldConfig private String minionType;
    @FieldConfig private boolean autoRetry;
    @FieldConfig private boolean basicAlerts;
    @FieldConfig private boolean basicPages;
    @FieldConfig private JobQueryConfig queryConfig;

    /* If all errored tasks from an errored job are resolved and the job has been submitted within this cutoff,
       automatically enable the job. Default is 3 days (in milliseconds). */
    private static final long AUTO_ENABLE_CUTOFF =
            Parameter.longValue("job.enable.cutoff", 3L * 24 * 60 * 60 * 1000);

    /* Task states that indicate that a job can be considered done. Rebalance/host-failure replications are
       included so these long-running operations will not delay the job rekick. */
    private static final Set<JobTaskState> taskStatesToFinishJob = ImmutableSet.of(
            JobTaskState.IDLE, JobTaskState.ERROR, JobTaskState.REBALANCE, JobTaskState.FULL_REPLICATE);

    // For codec only
    public Job() {}

    /**
     * Create a job with the given id and no creator.
     *
     * @param id key used for storing / retrieving this job
     */
    public Job(String id) {
        this(id, null);
    }

    /**
     * Create a job with the given id and creator. The create time and end time are both
     * initialised to the current {@link JitterClock#globalTime()}.
     *
     * @param id      key used for storing / retrieving this job
     * @param creator creator of the job; may be null
     */
    public Job(String id, String creator) {
        this.id = id;
        this.creator = creator;
        this.createTime = JitterClock.globalTime();
        this.endTime = createTime;
        this.dontAutoBalanceMe = false;
        this.dontDeleteMe = false;
        this.dontCloneMe = false;
        this.config = "";
        this.queryConfig = new JobQueryConfig();
    }

    /**
     * Copy constructor: populate this job from the values exposed by another {@link IJob}.
     *
     * @param job the job to copy from
     */
    public Job(IJob job) {
        this.id = job.getId();
        // NOTE(review): this goes through the normal (non-forced) transition check starting from
        // the default state, so the source state may be rejected -- confirm this is intended.
        this.setState(job.getState());
        this.creator = job.getCreator();
        this.owner = job.getOwner();
        this.group = job.getGroup();
        this.ownerWritable = job.isOwnerWritable();
        this.groupWritable = job.isGroupWritable();
        this.worldWritable = job.isWorldWritable();
        this.ownerExecutable = job.isOwnerExecutable();
        this.groupExecutable = job.isGroupExecutable();
        this.worldExecutable = job.isWorldExecutable();
        this.lastModifiedBy = job.lastModifiedBy();
        this.lastModifiedAt = job.lastModifiedAt();
        this.description = job.getDescription();
        this.priority = job.getPriority();
        this.createTime = job.getCreateTime();
        this.submitTime = job.getSubmitTime();
        this.startTime = job.getStartTime();
        this.endTime = job.getEndTime();
        this.rekickTimeout = job.getRekickTimeout();
        this.maxRunTime = job.getMaxRunTime();
        // setTasks also recomputes countActiveTasks
        this.setTasks(job.getCopyOfTasks());
        this.config = job.getConfig();
        this.onComplete = job.getOnCompleteURL();
        this.onError = job.getOnErrorURL();
        this.onCompleteTimeout = job.getOnCompleteTimeout();
        this.onErrorTimeout = job.getOnErrorTimeout();
        this.runCount = job.getRunCount();
        this.runTime = job.getRunTime();
        this.command = job.getCommand();
        this.parameters = job.getParameters() != null ? Lists.newArrayList(job.getParameters()) : null;
        this.hourlyBackups = job.getHourlyBackups();
        this.dailyBackups = job.getDailyBackups();
        this.weeklyBackups = job.getWeeklyBackups();
        this.monthlyBackups = job.getMonthlyBackups();
        this.autoRetry = job.getAutoRetry();
        this.basicAlerts = job.getBasicAlerts();
        this.basicPages = job.getBasicPages();
        this.replicas = job.getReplicas();
        this.queryConfig = job.getQueryConfig();
        this.dontAutoBalanceMe = job.getDontAutoBalanceMe();
        this.dontDeleteMe = job.getDontDeleteMe();
        this.dontCloneMe = job.getDontCloneMe();
        this.maxSimulRunning = job.getMaxSimulRunning();
        this.minionType = job.getMinionType();
        this.wasStopped = job.getWasStopped();
        setEnabled(job.isEnabled());
    }

    @Override
    public String getId() {
        return id;
    }

    @Override
    public String getCreator() {
        return creator;
    }

    @Override
    public void setCreator(String creator) {
        this.creator = creator;
    }

    @Override
    public String getOwner() {
        return owner;
    }

    @Override
    public void setOwner(String owner) {
        this.owner = owner;
    }

    @Override
    public String getGroup() {
        return group;
    }

    @Override
    public void setGroup(String group) {
        this.group = group;
    }

    @Override
    public boolean isOwnerWritable() {
        return ownerWritable;
    }

    @Override
    public void setOwnerWritable(boolean ownerWritable) {
        this.ownerWritable = ownerWritable;
    }

    @Override
    public boolean isGroupWritable() {
        return groupWritable;
    }

    @Override
    public void setGroupWritable(boolean groupWritable) {
        this.groupWritable = groupWritable;
    }

    @Override
    public boolean isWorldWritable() {
        return worldWritable;
    }

    @Override
    public void setWorldWritable(boolean worldWritable) {
        this.worldWritable = worldWritable;
    }

    @Override
    public boolean isOwnerExecutable() {
        return ownerExecutable;
    }

    @Override
    public void setOwnerExecutable(boolean ownerExecutable) {
        this.ownerExecutable = ownerExecutable;
    }

    @Override
    public boolean isGroupExecutable() {
        return groupExecutable;
    }

    @Override
    public void setGroupExecutable(boolean groupExecutable) {
        this.groupExecutable = groupExecutable;
    }

    @Override
    public boolean isWorldExecutable() {
        return worldExecutable;
    }

    @Override
    public void setWorldExecutable(boolean worldExecutable) {
        this.worldExecutable = worldExecutable;
    }

    @Override
    public String lastModifiedBy() {
        return lastModifiedBy;
    }

    @Override
    public void setLastModifiedBy(String user) {
        this.lastModifiedBy = user;
    }

    @Override
    public long lastModifiedAt() {
        return lastModifiedAt;
    }

    @Override
    public void setLastModifiedAt(long time) {
        this.lastModifiedAt = time;
    }

    @Override
    public long getCreateTime() {
        return createTime;
    }

    @Override
    public String getDescription() {
        return description;
    }

    @Override
    public void setDescription(String description) {
        this.description = description;
    }

    @Override
    public String getCommand() {
        return command;
    }

    @Override
    public void setCommand(String command) {
        this.command = command;
    }

    @Override
    public int getPriority() {
        return priority;
    }

    @Override
    public void setPriority(int priority) {
        this.priority = priority;
    }

    @Override
    public Long getSubmitTime() {
        return submitTime;
    }

    @Override
    public void setSubmitTime(long submitTime) {
        this.submitTime = submitTime;
    }

    @Override
    public Long getStartTime() {
        return startTime;
    }

    @Override
    public void setStartTime(@Nullable Long startTime) {
        this.startTime = startTime;
    }

    @Override
    public Long getEndTime() {
        return endTime;
    }

    @Override
    public void setEndTime(@Nullable Long endTime) {
        this.endTime = endTime;
    }

    /**
     * Record the end of a run: if a start time is known, the elapsed time since it is added to
     * the accumulated run time; the end time is then set to {@code finishTime}.
     *
     * @param finishTime the time the run finished
     */
    public void setFinishTime(long finishTime) {
        if (startTime != null) {
            runTime += finishTime - startTime;
        }
        endTime = finishTime;
    }

    @Override
    public Long getRekickTimeout() {
        return rekickTimeout;
    }

    /**
     * Set the rekick timeout. A null or non-positive value is stored as null.
     *
     * @param rekick minutes between re-kicking, or null
     */
    @Override
    public void setRekickTimeout(Long rekick) {
        rekickTimeout = (rekick != null && rekick > 0) ? rekick : null;
    }

    @Override
    public Long getMaxRunTime() {
        return maxRunTime;
    }

    @Override
    public void setMaxRunTime(Long maxRunTime) {
        this.maxRunTime = maxRunTime;
    }

    @Override
    public boolean isEnabled() {
        return !disabled;
    }

    /**
     * Enable or disable the job. When enabling a job whose state is ERROR, the error codes of all
     * tasks are cleared, errored tasks are forced to IDLE, and the job state is recalculated.
     * When enabling a DEGRADED job, the job state is recalculated.
     *
     * @param enabled the desired enabled flag
     * @return true if the flag changed, false if the job was already in the requested condition
     */
    @Override
    public boolean setEnabled(boolean enabled) {
        if (enabled != disabled) {
            // enabled == !disabled already holds; nothing to do
            return false;
        }
        disabled = !enabled;
        // Determine new states
        if (enabled && state == JobState.ERROR.getValue()) {
            for (JobTask task : getCopyOfTasks()) {
                // capture the state before the error codes are reset
                JobTaskState taskState = task.getState();
                task.setErrorCode(0);
                task.setPreFailErrorCode(0);
                if (taskState == JobTaskState.ERROR) {
                    setTaskState(task, JobTaskState.IDLE, true);
                }
            }
            calculateJobState(true);
        } else if (enabled && state == JobState.DEGRADED.getValue()) {
            // Clear degraded state by recalculating
            calculateJobState(true);
        }
        return true;
    }

    @Override
    public Collection<JobParameter> getParameters() {
        return parameters;
    }

    /**
     * Replace the job parameters with a copy of the given collection (or null).
     */
    @Override
    public void setParameters(Collection<JobParameter> parameters) {
        this.parameters = (parameters != null) ? new ArrayList<>(parameters) : null;
    }

    @Override
    public String getConfig() {
        return config;
    }

    @Override
    public void setConfig(@Nullable String config) {
        this.config = config;
    }

    @Override
    public String getOnCompleteURL() {
        return onComplete;
    }

    @Override
    public void setOnCompleteURL(String url) {
        this.onComplete = url;
    }

    @Override
    public String getOnErrorURL() {
        return onError;
    }

    @Override
    public void setOnErrorURL(String url) {
        this.onError = url;
    }

    @Override
    public int getOnCompleteTimeout() {
        return onCompleteTimeout;
    }

    @Override
    public void setOnCompleteTimeout(int timeout) {
        this.onCompleteTimeout = timeout;
    }

    @Override
    public int getOnErrorTimeout() {
        return onErrorTimeout;
    }

    @Override
    public void setOnErrorTimeout(int timeout) {
        this.onErrorTimeout = timeout;
    }

    @Override
    public int getReplicas() {
        return replicas;
    }

    @Override
    public void setReplicas(int replicas) {
        this.replicas = replicas;
    }

    @Override
    public int getRunCount() {
        return runCount;
    }

    /**
     * Increment the run count and return the new value.
     */
    @Override
    public int incrementRunCount() {
        return ++runCount;
    }

    @Override
    public long getRunTime() {
        return runTime;
    }

    /**
     * @return the job state decoded from the stored numeric value, or {@link JobState#UNKNOWN}
     *         when the value does not map to a state
     */
    @Override
    public JobState getState() {
        JobState jobState = JobState.makeState(state);
        return jobState == null ? JobState.UNKNOWN : jobState;
    }

    /**
     * Attempt a non-forced state transition; see {@link #setState(JobState, boolean)}.
     */
    @Override
    public boolean setState(JobState state) {
        return setState(state, false);
    }

    /**
     * Attempt to move the job to a new state. The change is applied when it is forced, when the
     * job is enabled and the current state can transition to the target, or when the job is
     * disabled and the target is IDLE or ERROR.
     *
     * @param state the target state
     * @param force whether to bypass the transition check
     * @return true if the state was applied or already equal to the current state; false if the
     *         transition was rejected (a warning and the current stack trace are logged)
     */
    public boolean setState(JobState state, boolean force) {
        JobState curr = getState();
        if (force
            || (isEnabled() && curr.canTransition(state))
            || (!isEnabled() && (state == JobState.IDLE))
            || (!isEnabled() && (state == JobState.ERROR))) {
            // Note dependence on ordering!
            // NOTE(review): stores ordinal() while other code compares against getValue();
            // presumably the two agree for every JobState -- confirm.
            this.state = state.ordinal();
            return true;
        } else if (state != curr) {
            log.warn("[job.setstate] {}job {} cannot transition {} -> {}",
                     (disabled) ? "disabled " : "", getId(), curr, state);
            for (StackTraceElement elt : Thread.currentThread().getStackTrace()) {
                log.warn(elt.toString());
            }
            return false;
        }
        return true;
    }

    /**
     * @return the number of tasks, or 0 when no task list has been set yet
     */
    @Override
    public int getTaskCount() {
        return nodes == null ? 0 : nodes.size();
    }

    /**
     * Look up a task by task id. The matching task has its job UUID set to this job's id before
     * it is returned.
     *
     * @param id the task id to find
     * @return the matching task, or null if there is no task list or no task has that id
     */
    @Nullable @Override
    public synchronized JobTask getTask(int id) {
        if (nodes == null) {
            return null;
        }
        for (JobTask node : nodes) {
            if (node.getTaskID() == id) {
                node.setJobUUID(this.id);
                return node;
            }
        }
        return null;
    }

    /**
     * @return an immutable snapshot of the task list (empty if none was set; the backing list is
     *         lazily created)
     */
    @Override
    public synchronized List<JobTask> getCopyOfTasks() {
        if (nodes == null) {
            nodes = new ArrayList<>();
        }
        return ImmutableList.copyOf(nodes);
    }

    /**
     * @return a new mutable list holding the tasks ordered by ascending task id
     */
    public synchronized List<JobTask> getCopyOfTasksSorted() {
        // synchronized like the other accessors: this both lazily creates and copies nodes
        if (nodes == null) {
            nodes = new ArrayList<>();
        }
        List<JobTask> tasksCopy = Lists.newArrayList(nodes);
        Collections.sort(tasksCopy, taskNodeComparator);
        return tasksCopy;
    }

    /**
     * Append a task, bumping the active-task counter if the task is in an active state.
     */
    @Override
    public synchronized void addTask(JobTask task) {
        if (nodes == null) {
            nodes = new ArrayList<>();
        }
        nodes.add(task);
        if (task.getState().isActiveState()) {
            this.countActiveTasks++;
        }
    }

    /**
     * Recompute the active-task counter from scratch; a missing task list counts as zero tasks.
     */
    private synchronized void recountActiveTasks() {
        int active = 0;
        if (nodes != null) {
            for (JobTask t : nodes) {
                if (t.getState().isActiveState()) {
                    active++;
                }
            }
        }
        this.countActiveTasks = active;
    }

    /**
     * Replace the task list with a copy of the given list and recompute the active-task counter.
     *
     * @param tasks the new tasks; null is treated as an empty list
     */
    @Override
    public synchronized void setTasks(List<JobTask> tasks) {
        this.nodes = (tasks == null) ? new ArrayList<>() : Lists.newArrayList(tasks);
        recountActiveTasks();
    }

    public synchronized int getCountActiveTasks() {
        return countActiveTasks;
    }

    @Override
    public JobQueryConfig getQueryConfig() {
        return queryConfig;
    }

    @Override
    public void setQueryConfig(JobQueryConfig queryConfig) {
        this.queryConfig = queryConfig;
    }

    /**
     * Encode this job as JSON after refreshing the active-task counter.
     */
    @Override
    public JSONObject toJSON() throws Exception {
        recountActiveTasks();
        return CodecJSON.encodeJSON(this);
    }

    /**
     * @return the codec JSON string for this job, falling back to the default
     *         {@link Object#toString()} if encoding fails
     */
    @Override
    public String toString() {
        try {
            return CodecJSON.encodeString(this);
        } catch (Exception e) {
            return super.toString();
        }
    }

    /**
     * Order jobs by submit time (jobs without a submit time sort first), breaking ties by job id
     * so that distinct jobs submitted at the same instant still have a stable, antisymmetric
     * ordering.
     */
    @Override
    public int compareTo(IJob o) {
        int bySubmitTime = compareNullsFirst(getSubmitTime(), o.getSubmitTime());
        if (bySubmitTime != 0) {
            return bySubmitTime;
        }
        return compareNullsFirst(getId(), o.getId());
    }

    /* Null-safe natural-order comparison where null sorts before any non-null value. */
    private static <T extends Comparable<? super T>> int compareNullsFirst(@Nullable T left, @Nullable T right) {
        if (left == null) {
            return (right == null) ? 0 : -1;
        }
        if (right == null) {
            return 1;
        }
        return left.compareTo(right);
    }

    /**
     * Change a task's state without forcing; see {@link #setTaskState(JobTask, JobTaskState, boolean)}.
     */
    public synchronized boolean setTaskState(JobTask task, JobTaskState newState) {
        return setTaskState(task, newState, false);
    }

    /**
     * Change a task's state, and update the job's state if appropriate. The active-task counter
     * is adjusted to match, and moving a task to ERROR disables the job.
     *
     * @param task     The task to modify
     * @param newState The new state to set
     * @param force    Whether to force the state transition regardless of the expected transition map
     * @return True on success
     */
    public synchronized boolean setTaskState(JobTask task, JobTaskState newState, boolean force) {
        JobTaskState prevState = task.getState();
        if (!task.setState(newState, force)) {
            return false;
        }
        boolean wasActive = prevState.isActiveState();
        boolean nowActive = newState.isActiveState();
        if (wasActive && !nowActive) {
            this.countActiveTasks--;
        } else if (!wasActive && nowActive) {
            this.countActiveTasks++;
        }
        if (newState == JobTaskState.ERROR) {
            this.disabled = true;
        }
        calculateJobState(force);
        return true;
    }

    /**
     * Calculate the job state based on the state of its tasks. Priority, highest first:
     * ERROR, REBALANCE, RUNNING, SCHEDULED, IDLE. On a successful state change the job's
     * wasStopped flag is set to whether any task reports having been stopped.
     *
     * @param force whether to force the job state transition
     * @return true if the job state was applied
     */
    private boolean calculateJobState(boolean force) {
        boolean err = false, sched = false, run = false, reb = false, stopped = false;
        // Scan every task (no early exit) so that 'stopped' reflects all tasks; the snapshot
        // also copes with a job that has no task list yet.
        for (JobTask t : getCopyOfTasks()) {
            if (t.getWasStopped()) {
                stopped = true;
            }
            JobTaskState taskState = t.getState();
            if (taskState == JobTaskState.REBALANCE) {
                reb = true;
            } else if (t.isRunning()) {
                run = true;
            } else if (taskState == JobTaskState.ALLOCATED || taskState.isQueuedState()) {
                sched = true;
            } else if (taskState == JobTaskState.ERROR) {
                err = true;
            }
        }
        JobState oldJobState = getState();
        JobState nextState;
        if (err) {
            nextState = JobState.ERROR;
        } else if (reb) {
            nextState = JobState.REBALANCE;
        } else if (run) {
            nextState = JobState.RUNNING;
        } else if (sched) {
            nextState = JobState.SCHEDULED;
        } else {
            nextState = JobState.IDLE;
        }
        if (!setState(nextState, force)) {
            return false;
        }
        // If transitioning from error to non-error state, enable job as long as it was submitted recently.
        Long submitted = getSubmitTime();
        if (oldJobState == JobState.ERROR && nextState != JobState.ERROR && submitted != null
            && System.currentTimeMillis() - submitted < AUTO_ENABLE_CUTOFF) {
            setEnabled(true);
        }
        wasStopped = stopped;
        return true;
    }

    /**
     * Force a task into the ERROR state and record the given error code on it.
     */
    public void errorTask(JobTask task, int errorCode) {
        setTaskState(task, JobTaskState.ERROR, true);
        task.setErrorCode(errorCode);
    }

    /**
     * Check whether all tasks of a job are idle/errored/rebalancing. If a job is kicked, and isFinished evaluates
     * to true, then it can be assumed that every task ran at least once, regardless of whether any rebalancing
     * was started in the mean time.
     *
     * @return True if the job is finished
     */
    public boolean isFinished() {
        for (JobTask jobTask : getCopyOfTasks()) {
            if (!taskStatesToFinishJob.contains(jobTask.getState())) {
                return false;
            }
        }
        return true;
    }

    @Override
    public boolean getDontAutoBalanceMe() {
        return dontAutoBalanceMe;
    }

    @Override
    public void setDontDeleteMe(boolean dontDeleteMe) {
        this.dontDeleteMe = dontDeleteMe;
    }

    @Override
    public boolean getDontDeleteMe() {
        return dontDeleteMe;
    }

    @Override
    public void setDontCloneMe(boolean dontCloneMe) {
        this.dontCloneMe = dontCloneMe;
    }

    @Override
    public boolean getDontCloneMe() {
        return dontCloneMe;
    }

    @Override
    public void setDontAutoBalanceMe(boolean dontAutoBalanceMe) {
        this.dontAutoBalanceMe = dontAutoBalanceMe;
    }

    @Override
    public int getHourlyBackups() {
        return hourlyBackups;
    }

    @Override
    public int getDailyBackups() {
        return dailyBackups;
    }

    @Override
    public int getWeeklyBackups() {
        return weeklyBackups;
    }

    @Override
    public int getMonthlyBackups() {
        return monthlyBackups;
    }

    @Override
    public void setHourlyBackups(int hourlyBackups) {
        this.hourlyBackups = hourlyBackups;
    }

    @Override
    public void setDailyBackups(int dailyBackups) {
        this.dailyBackups = dailyBackups;
    }

    @Override
    public void setWeeklyBackups(int weeklyBackups) {
        this.weeklyBackups = weeklyBackups;
    }

    @Override
    public void setMonthlyBackups(int monthlyBackups) {
        this.monthlyBackups = monthlyBackups;
    }

    @Override
    public boolean getWasStopped() {
        return wasStopped;
    }

    @Override
    public void setWasStopped(boolean wasStopped) {
        this.wasStopped = wasStopped;
    }

    /**
     * Mark a task as finished. If the task was replicating or backing up and carries a positive
     * pre-fail error code, that error is restored instead. Otherwise the task's error code is
     * cleared and the task is forced to IDLE; if the job is then IDLE its end time is stamped.
     * Finally, if no task remains in ERROR and the task's previous error was a replicate or
     * backup failure, the job is re-enabled.
     *
     * @param task the task that finished
     */
    public void setTaskFinished(JobTask task) {
        int preFailErrorCode = task.getPreFailErrorCode();
        int oldErrorCode = task.getErrorCode();
        JobTaskState taskState = task.getState();
        if ((taskState == JobTaskState.REPLICATE || taskState == JobTaskState.BACKUP) && preFailErrorCode > 0) {
            // Restore the old job error if it existed
            errorTask(task, preFailErrorCode);
            return;
        }
        task.setErrorCode(0);
        setTaskState(task, JobTaskState.IDLE, true);
        if (getState() == JobState.IDLE) {
            setEndTime(JitterClock.globalTime());
        }
        boolean wasReplicateOrBackupFailure = oldErrorCode == JobTaskErrorCode.EXIT_REPLICATE_FAILURE
                                              || oldErrorCode == JobTaskErrorCode.EXIT_BACKUP_FAILURE;
        // Both conditions must hold: never re-enable while another task is still errored.
        if (countErrorTasks() == 0 && wasReplicateOrBackupFailure) {
            // If the job is disabled because this task failed to replicate, enable it.
            log.warn("Enabling job {} because the last replicate/backup error was resolved", getId());
            disabled = false;
        }
    }

    @Override
    public int getMaxSimulRunning() {
        return maxSimulRunning;
    }

    @Override
    public void setMaxSimulRunning(int maxSimulRunning) {
        this.maxSimulRunning = maxSimulRunning;
    }

    @Override
    public boolean getAutoRetry() {
        return autoRetry;
    }

    @Override
    public void setAutoRetry(boolean autoRetry) {
        this.autoRetry = autoRetry;
    }

    @Override
    public boolean getBasicAlerts() {
        return this.basicAlerts;
    }

    @Override
    public void setBasicAlerts(boolean basicAlerts) {
        this.basicAlerts = basicAlerts;
    }

    @Override
    public boolean getBasicPages() {
        return this.basicPages;
    }

    @Override
    public void setBasicPages(boolean basicPages) {
        this.basicPages = basicPages;
    }

    /* Number of tasks currently in the ERROR state. */
    private int countErrorTasks() {
        int count = 0;
        for (JobTask task : getCopyOfTasks()) {
            if (task.getState() == JobTaskState.ERROR) {
                count++;
            }
        }
        return count;
    }

    /**
     * @return the sum of all task byte counts divided by the number of tasks (integer division),
     *         or 0 when the job has no tasks
     */
    public long calcAverageTaskSizeBytes() {
        List<JobTask> tasks = getCopyOfTasks();
        if (tasks.isEmpty()) {
            return 0;
        }
        long totalBytes = 0;
        for (JobTask task : tasks) {
            totalBytes += task.getByteCount();
        }
        return totalBytes / tasks.size();
    }

    /**
     * @return the minion type, first defaulting (and storing) {@code Minion.defaultMinionType}
     *         when none has been set
     */
    @Override
    public String getMinionType() {
        if (minionType == null) {
            minionType = Minion.defaultMinionType;
        }
        return minionType;
    }

    /**
     * Get an estimate for the last time this job was run.
     *
     * @return the end time when it is set; otherwise the start time if the job is IDLE, and null
     *         for any other state
     */
    public Long getCanonicalTime() {
        // NOTE(review): the original comment said "use end-time if non-null; otherwise startTime",
        // but startTime is only used when the job is IDLE -- confirm which is intended.
        return (endTime == null && getState() == JobState.IDLE) ? startTime : endTime;
    }

    @Override
    public void setMinionType(String minionType) {
        this.minionType = minionType;
    }

    /**
     * Decide whether the job is due for an automatic rekick: it must be enabled, have run at
     * least once, have a positive rekick timeout (minutes), and at least that long must have
     * passed since {@link #getCanonicalTime()}.
     *
     * @param clock the current time in milliseconds
     * @return true if the job should be rekicked
     */
    public boolean shouldAutoRekick(long clock) {
        Long canonicalTime = getCanonicalTime();
        Long rekickMinutes = getRekickTimeout();
        return isEnabled() && canonicalTime != null && getRunCount() > 0 && rekickMinutes != null &&
               rekickMinutes > 0 && clock - canonicalTime >= (rekickMinutes * 60000L);
    }

    /**
     * Log a job event to a rolling log file. A null job cannot be described, so it is reported
     * with a warning and no event is written.
     *
     * @param job      the job the event refers to
     * @param event    the event to record
     * @param eventLog the rolling log to write to
     */
    public static void logJobEvent(@Nullable Job job, JobEvent event, RollingLog eventLog) {
        if (job == null) {
            log.warn("[job.event] cannot log event {} for a null job", event);
            return;
        }
        LogUtil.log(eventLog, log, new StringMapHelper()
                .put("event", event)
                .put("time", System.currentTimeMillis())
                .put("jobid", job.getId())
                .put("creator", job.getCreator())
                .put("owner", job.getOwner())
                .put("createTime", job.getCreateTime())
                .put("priority", job.getPriority())
                .put("replicas", job.getReplicas())
                .put("runCount", job.getRunCount())
                .put("state", job.getState())
                .put("taskCount", job.getTaskCount())
                .put("avgTaskSize", job.calcAverageTaskSizeBytes())
                .put("startTime", job.getStartTime())
                .put("endTime", job.getEndTime())
                .put("submitTime", job.getSubmitTime())
                .put("command", job.getCommand()));
    }
}