package com.linkedin.thirdeye.anomaly.detection;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.joda.time.Period;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import com.linkedin.thirdeye.api.TimeGranularity;
import com.linkedin.thirdeye.api.TimeSpec;
import com.linkedin.thirdeye.completeness.checker.DataCompletenessUtils;
import com.linkedin.thirdeye.datalayer.dto.AnomalyFunctionDTO;
import com.linkedin.thirdeye.datalayer.dto.DatasetConfigDTO;
import com.linkedin.thirdeye.datalayer.dto.DetectionStatusDTO;
import com.linkedin.thirdeye.util.ThirdEyeUtils;
public class DetectionJobSchedulerUtils {
private static final String DAY_FORMAT = "yyyyMMdd";
private static final String HOUR_FORMAT = "yyyyMMddHH";
private static final String MINUTE_FORMAT = "yyyyMMddHHmm";
/**
   * Gets the date time formatter that matches the granularity of the dataset.
   * This formatter is used to store dates in the db in the correct simple date format (SDF).
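   * <p>Illustrative sketch, assuming {@code datasetConfig} describes an HOURS-granularity dataset:
   * <pre>{@code
   * DateTimeFormatter formatter =
   *     DetectionJobSchedulerUtils.getDateTimeFormatterForDataset(datasetConfig, DateTimeZone.UTC);
   * // HOURS granularity maps to the "yyyyMMddHH" pattern, so this prints "2017010112"
   * String dateString = formatter.print(new DateTime(2017, 1, 1, 12, 0, DateTimeZone.UTC));
   * }</pre>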
   * @param datasetConfig the dataset config, used to determine the data granularity
   * @param dateTimeZone the time zone to apply to the formatter
   * @return a formatter whose pattern matches the dataset granularity
*/
public static DateTimeFormatter getDateTimeFormatterForDataset(
DatasetConfigDTO datasetConfig, DateTimeZone dateTimeZone) {
String pattern = null;
TimeSpec timeSpec = ThirdEyeUtils.getTimeSpecFromDatasetConfig(datasetConfig);
TimeUnit unit = timeSpec.getDataGranularity().getUnit();
switch (unit) {
case DAYS:
pattern = DAY_FORMAT;
break;
case MINUTES:
case SECONDS:
case MILLISECONDS:
pattern = MINUTE_FORMAT;
break;
case HOURS:
default:
pattern = HOUR_FORMAT;
break;
}
    return DateTimeFormat.forPattern(pattern).withZone(dateTimeZone);
}
/**
   * Rounds the given time down to the earlier bucket boundary, depending on the granularity of the dataset,
   * e.g. 12:15pm on an HOURLY dataset is treated as 12pm.
   * A dataset with granularity finer than HOUR is rounded according to the function frequency
   * (assumed to be in MINUTES), so 12:53 on a 5-MINUTE dataset with a function frequency of
   * 15 MINUTES is rounded to 12:45.
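   * <p>Illustrative sketch, assuming {@code datasetConfig} is MINUTE-level and {@code anomalyFunction}
   * has a frequency of 15 MINUTES:
   * <pre>{@code
   * DateTime time = new DateTime(2017, 1, 1, 12, 53, DateTimeZone.UTC);
   * long aligned =
   *     DetectionJobSchedulerUtils.getBoundaryAlignedTimeForDataset(datasetConfig, time, anomalyFunction);
   * // 53 / 15 * 15 = 45, so aligned corresponds to 12:45
   * }</pre>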
   * @param datasetConfig the dataset config, used to determine the data granularity
   * @param dateTime the time to align
   * @param anomalyFunction the anomaly function, whose frequency is used for MINUTE-level data
   * @return the boundary-aligned time in milliseconds
*/
public static long getBoundaryAlignedTimeForDataset(DatasetConfigDTO datasetConfig, DateTime dateTime,
AnomalyFunctionDTO anomalyFunction) {
TimeSpec timeSpec = ThirdEyeUtils.getTimeSpecFromDatasetConfig(datasetConfig);
TimeUnit dataUnit = timeSpec.getDataGranularity().getUnit();
TimeGranularity functionFrequency = anomalyFunction.getFrequency();
    // For MINUTE-level datasets with the function frequency defined in MINUTES
    // (as long as the size doesn't exceed 30 minutes; otherwise just use 1 HOUR),
    // align the time according to the function frequency
if (dataUnit.equals(TimeUnit.MINUTES) || dataUnit.equals(TimeUnit.MILLISECONDS) || dataUnit.equals(TimeUnit.SECONDS)) {
      if (functionFrequency.getUnit().equals(TimeUnit.MINUTES) && (functionFrequency.getSize() <= 30)) {
        int minuteBucketSize = functionFrequency.getSize();
        int roundedMinutes = (dateTime.getMinuteOfHour() / minuteBucketSize) * minuteBucketSize;
dateTime = dateTime.withTime(dateTime.getHourOfDay(), roundedMinutes, 0, 0);
} else {
dateTime = getBoundaryAlignedTimeForDataset(dateTime, TimeUnit.HOURS); // default to HOURS
}
} else {
dateTime = getBoundaryAlignedTimeForDataset(dateTime, dataUnit);
}
return dateTime.getMillis();
}
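  /**
   * Rounds the given time down to the start of the bucket for the given unit:
   * start of day for DAYS, start of hour for HOURS and all other units.
   */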
private static DateTime getBoundaryAlignedTimeForDataset(DateTime dateTime, TimeUnit unit) {
switch (unit) {
case DAYS:
dateTime = dateTime.withTimeAtStartOfDay();
break;
case HOURS:
default:
dateTime = dateTime.withTime(dateTime.getHourOfDay(), 0, 0, 0);
break;
}
return dateTime;
}
/**
   * Gets the bucket size as a Period, according to the data granularity of the dataset.
   * Bucket sizes are 1 HOUR for hourly data and 1 DAY for daily data.
   * For MINUTE-level data, the bucket size is calculated from the anomaly function frequency.
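   * <p>Illustrative sketch, assuming {@code datasetConfig} is MINUTE-level and {@code anomalyFunction}
   * has a frequency of 10 MINUTES:
   * <pre>{@code
   * Period bucketSize =
   *     DetectionJobSchedulerUtils.getBucketSizePeriodForDataset(datasetConfig, anomalyFunction);
   * // the frequency is within the 30-minute cap, so bucketSize.getMinutes() == 10
   * }</pre>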
   * @param datasetConfig the dataset config, used to determine the data granularity
   * @param anomalyFunction the anomaly function, whose frequency is used for MINUTE-level data
   * @return the bucket size as a Period
*/
public static Period getBucketSizePeriodForDataset(DatasetConfigDTO datasetConfig, AnomalyFunctionDTO anomalyFunction) {
Period bucketSizePeriod = null;
TimeSpec timeSpec = ThirdEyeUtils.getTimeSpecFromDatasetConfig(datasetConfig);
TimeUnit dataUnit = timeSpec.getDataGranularity().getUnit();
TimeGranularity functionFrequency = anomalyFunction.getFrequency();
    // For MINUTE-level datasets with the function frequency defined in MINUTES
    // (as long as the size doesn't exceed 30 minutes; otherwise just use 1 HOUR),
    // calculate the bucket period according to the function frequency
if (dataUnit.equals(TimeUnit.MINUTES) || dataUnit.equals(TimeUnit.MILLISECONDS) || dataUnit.equals(TimeUnit.SECONDS)) {
      if (functionFrequency.getUnit().equals(TimeUnit.MINUTES) && (functionFrequency.getSize() <= 30)) {
        bucketSizePeriod = new Period(0, 0, 0, 0, 0, functionFrequency.getSize(), 0, 0); // n MINUTES
} else {
bucketSizePeriod = getBucketSizePeriodForUnit(TimeUnit.HOURS); // default to 1 HOUR
}
} else {
bucketSizePeriod = getBucketSizePeriodForUnit(dataUnit);
}
return bucketSizePeriod;
}
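  /**
   * Returns the bucket size as a Period for the given unit: 1 DAY for DAYS,
   * 1 HOUR for HOURS and all other units.
   */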
private static Period getBucketSizePeriodForUnit(TimeUnit unit) {
Period bucketSizePeriod = null;
switch (unit) {
case DAYS:
bucketSizePeriod = new Period(0, 0, 0, 1, 0, 0, 0, 0); // 1 DAY
break;
case HOURS:
default:
bucketSizePeriod = new Period(0, 0, 0, 0, 1, 0, 0, 0); // 1 HOUR
break;
}
return bucketSizePeriod;
}
/**
   * Creates new entries from the last entry up to the current time,
   * according to the time granularity of the dataset in the HOURLY/DAILY case,
   * and according to the anomaly function frequency in the MINUTE-level case:
   * for an HOURLY dataset, detection runs for every HOUR;
   * for a DAILY dataset, detection runs for every DAY;
   * for an n-MINUTE dataset, detection runs for every bucket, as determined by the frequency field of the anomaly function.
*
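   * <p>Illustrative sketch for an HOURLY dataset, assuming the last entry ({@code lastEntry}) was at
   * 10:00 and {@code currentDateTime} is 13:05 (which aligns down to 13:00):
   * <pre>{@code
   * Map<String, Long> entries = DetectionJobSchedulerUtils.getNewEntries(
   *     currentDateTime, lastEntry, anomalyFunction, datasetConfig, DateTimeZone.UTC);
   * // one entry per hourly bucket after the last entry: keys for the 11:00, 12:00 and 13:00 buckets
   * }</pre>
   *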
   * @param currentDateTime the current time, used as the upper bound for new entries
   * @param lastEntryForFunction the most recent detection status entry, or null if none exists
   * @param anomalyFunction the anomaly function, whose frequency is used for MINUTE-level data
   * @param datasetConfig the dataset config, used to determine the data granularity
   * @param dateTimeZone the time zone used for alignment and formatting
   * @return a map from formatted date string to the corresponding time in milliseconds
*/
public static Map<String, Long> getNewEntries(DateTime currentDateTime, DetectionStatusDTO lastEntryForFunction,
AnomalyFunctionDTO anomalyFunction, DatasetConfigDTO datasetConfig, DateTimeZone dateTimeZone) {
Map<String, Long> newEntries = new LinkedHashMap<>();
// get current hour/day, depending on granularity of dataset,
DateTimeFormatter dateTimeFormatterForDataset = DetectionJobSchedulerUtils.
getDateTimeFormatterForDataset(datasetConfig, dateTimeZone);
long alignedCurrentMillis =
DetectionJobSchedulerUtils.getBoundaryAlignedTimeForDataset(datasetConfig, currentDateTime, anomalyFunction);
DateTime alignedDateTime = new DateTime(alignedCurrentMillis, dateTimeZone);
// if first ever entry, create it with current time
if (lastEntryForFunction == null) {
String currentDateString = dateTimeFormatterForDataset.print(alignedDateTime);
newEntries.put(currentDateString, dateTimeFormatterForDataset.parseMillis(currentDateString));
} else { // else create all entries from last entry onwards to current time
DateTime lastDateTime = new DateTime(lastEntryForFunction.getDateToCheckInMS(), dateTimeZone);
Period bucketSizePeriod = DetectionJobSchedulerUtils.getBucketSizePeriodForDataset(datasetConfig, anomalyFunction);
while (lastDateTime.isBefore(alignedDateTime)) {
lastDateTime = lastDateTime.plus(bucketSizePeriod);
newEntries.put(dateTimeFormatterForDataset.print(lastDateTime), lastDateTime.getMillis());
}
}
return newEntries;
}
/**
   * Creates the job name for an anomaly detection job, of the form
   * functionId-functionName-firstStartTime-firstEndTime-numWindows.
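   * <p>Illustrative sketch (the id, name and times are made up):
   * <pre>{@code
   * // function id 42, name "clicksWoW", one monitoring window from 1000 to 2000
   * String jobName = DetectionJobSchedulerUtils.createJobName(
   *     anomalyFunction, Arrays.asList(1000L), Arrays.asList(2000L));
   * // jobName is "42-clicksWoW-1000-2000-1"
   * }</pre>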
   * @param anomalyFunction the anomaly function
   * @param startTimes start times of the monitoring windows
   * @param endTimes end times of the monitoring windows
   * @return the job name
*/
public static String createJobName(AnomalyFunctionDTO anomalyFunction, List<Long> startTimes, List<Long> endTimes) {
return String.format("%s-%s-%s-%s-%d", anomalyFunction.getId(), anomalyFunction.getFunctionName(),
startTimes.get(0), endTimes.get(0), startTimes.size());
}
/**
   * Calculates the number of complete buckets that a time period can be divided into,
   * depending on the granularity of the dataset.
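   * <p>Worked example, assuming an HOURLY dataset whose bucket size is one hour (3,600,000 ms):
   * <pre>{@code
   * // a 6-hour window divides into 21600000 / 3600000 = 6 buckets
   * long buckets = DetectionJobSchedulerUtils.getExpectedCompleteBuckets(
   *     datasetConfig, 0L, TimeUnit.HOURS.toMillis(6));
   * }</pre>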
   * @param datasetConfig the dataset config, used to determine the bucket size
   * @param startTime start of the time period, in milliseconds
   * @param endTime end of the time period, in milliseconds
   * @return the number of complete buckets in the period
*/
public static long getExpectedCompleteBuckets(DatasetConfigDTO datasetConfig, long startTime, long endTime) {
TimeSpec timeSpec = ThirdEyeUtils.getTimeSpecFromDatasetConfig(datasetConfig);
    // Get the bucket size from DataCompletenessUtils, since that is what determines the number of buckets to check
    long bucketSize = DataCompletenessUtils.getBucketSizeInMSForDataset(timeSpec);
    long numBuckets = (endTime - startTime) / bucketSize;
return numBuckets;
}
}