package com.linkedin.thirdeye.anomaly.detection;

import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;

import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.joda.time.Period;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;

import com.linkedin.thirdeye.api.TimeGranularity;
import com.linkedin.thirdeye.api.TimeSpec;
import com.linkedin.thirdeye.completeness.checker.DataCompletenessUtils;
import com.linkedin.thirdeye.datalayer.dto.AnomalyFunctionDTO;
import com.linkedin.thirdeye.datalayer.dto.DatasetConfigDTO;
import com.linkedin.thirdeye.datalayer.dto.DetectionStatusDTO;
import com.linkedin.thirdeye.util.ThirdEyeUtils;

public class DetectionJobSchedulerUtils {

  private static final String DAY_FORMAT = "yyyyMMdd";
  private static final String HOUR_FORMAT = "yyyyMMddHH";
  private static final String MINUTE_FORMAT = "yyyyMMddHHmm";

  /**
   * Gets the date time formatter according to the granularity of the dataset.
   * This is used to store the date in the db, in the correct date format.
   * @param datasetConfig
   * @param dateTimeZone
   * @return the date time formatter for the dataset's granularity, in the given time zone
   */
  public static DateTimeFormatter getDateTimeFormatterForDataset(DatasetConfigDTO datasetConfig,
      DateTimeZone dateTimeZone) {
    String pattern = null;
    TimeSpec timeSpec = ThirdEyeUtils.getTimeSpecFromDatasetConfig(datasetConfig);
    TimeUnit unit = timeSpec.getDataGranularity().getUnit();
    switch (unit) {
      case DAYS:
        pattern = DAY_FORMAT;
        break;
      case MINUTES:
      case SECONDS:
      case MILLISECONDS:
        pattern = MINUTE_FORMAT;
        break;
      case HOURS:
      default:
        pattern = HOUR_FORMAT;
        break;
    }
    DateTimeFormatter dateTimeFormatter = DateTimeFormat.forPattern(pattern).withZone(dateTimeZone);
    return dateTimeFormatter;
  }
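  // Illustrative example (not part of the original class): for an HOURLY dataset and the UTC zone,
  // the formatter returned above uses the "yyyyMMddHH" pattern, so 2017-06-05T12:00 UTC prints as
  // "2017060512", which is the string form in which detection status dates are stored in the db.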
  /**
   * Rounds the given time down to the earlier boundary, depending on the granularity of the dataset.
   * e.g. 12:15pm on an HOURLY dataset should be treated as 12pm.
   * Any dataset with granularity finer than HOUR is rounded according to the function frequency
   * (the assumption is that the frequency is in MINUTES);
   * so 12:53 on a 5 MINUTES dataset, with a function frequency of 15 MINUTES, is rounded to 12:45.
   * @param datasetConfig
   * @param dateTime
   * @param anomalyFunction
   * @return the boundary-aligned time in milliseconds
   */
  public static long getBoundaryAlignedTimeForDataset(DatasetConfigDTO datasetConfig, DateTime dateTime,
      AnomalyFunctionDTO anomalyFunction) {
    TimeSpec timeSpec = ThirdEyeUtils.getTimeSpecFromDatasetConfig(datasetConfig);
    TimeUnit dataUnit = timeSpec.getDataGranularity().getUnit();
    TimeGranularity functionFrequency = anomalyFunction.getFrequency();

    // For nMINUTE level datasets, with frequency defined in nMINUTES in the function
    // (if the size exceeds 30 minutes, just use 1 HOUR instead),
    // calculate time periods according to the function frequency
    if (dataUnit.equals(TimeUnit.MINUTES) || dataUnit.equals(TimeUnit.MILLISECONDS)
        || dataUnit.equals(TimeUnit.SECONDS)) {
      if (functionFrequency.getUnit().equals(TimeUnit.MINUTES) && (functionFrequency.getSize() <= 30)) {
        int minuteBucketSize = functionFrequency.getSize();
        int roundedMinutes = (dateTime.getMinuteOfHour() / minuteBucketSize) * minuteBucketSize;
        dateTime = dateTime.withTime(dateTime.getHourOfDay(), roundedMinutes, 0, 0);
      } else {
        dateTime = getBoundaryAlignedTimeForDataset(dateTime, TimeUnit.HOURS); // default to HOURS
      }
    } else {
      dateTime = getBoundaryAlignedTimeForDataset(dateTime, dataUnit);
    }
    return dateTime.getMillis();
  }

  private static DateTime getBoundaryAlignedTimeForDataset(DateTime dateTime, TimeUnit unit) {
    switch (unit) {
      case DAYS:
        dateTime = dateTime.withTimeAtStartOfDay();
        break;
      case HOURS:
      default:
        dateTime = dateTime.withTime(dateTime.getHourOfDay(), 0, 0, 0);
        break;
    }
    return dateTime;
  }

  /**
   * Gets the bucket size as a period, according to the data granularity of the dataset.
   * Bucket sizes are 1 HOUR for hourly and 1 DAY for daily datasets.
   * For MINUTE level data, the bucket size is calculated based on the anomaly function frequency.
   * @param datasetConfig
   * @param anomalyFunction
   * @return the bucket size period
   */
  public static Period getBucketSizePeriodForDataset(DatasetConfigDTO datasetConfig,
      AnomalyFunctionDTO anomalyFunction) {
    Period bucketSizePeriod = null;
    TimeSpec timeSpec = ThirdEyeUtils.getTimeSpecFromDatasetConfig(datasetConfig);
    TimeUnit dataUnit = timeSpec.getDataGranularity().getUnit();
    TimeGranularity functionFrequency = anomalyFunction.getFrequency();

    // For nMINUTE level datasets, with frequency defined in nMINUTES in the function
    // (if the size exceeds 30 minutes, just use 1 HOUR instead),
    // calculate time periods according to the function frequency
    if (dataUnit.equals(TimeUnit.MINUTES) || dataUnit.equals(TimeUnit.MILLISECONDS)
        || dataUnit.equals(TimeUnit.SECONDS)) {
      if (functionFrequency.getUnit().equals(TimeUnit.MINUTES) && (functionFrequency.getSize() <= 30)) {
        bucketSizePeriod = new Period(0, 0, 0, 0, 0, functionFrequency.getSize(), 0, 0);
      } else {
        bucketSizePeriod = getBucketSizePeriodForUnit(TimeUnit.HOURS); // default to 1 HOUR
      }
    } else {
      bucketSizePeriod = getBucketSizePeriodForUnit(dataUnit);
    }
    return bucketSizePeriod;
  }

  private static Period getBucketSizePeriodForUnit(TimeUnit unit) {
    Period bucketSizePeriod = null;
    switch (unit) {
      case DAYS:
        bucketSizePeriod = new Period(0, 0, 0, 1, 0, 0, 0, 0); // 1 DAY
        break;
      case HOURS:
      default:
        bucketSizePeriod = new Period(0, 0, 0, 0, 1, 0, 0, 0); // 1 HOUR
        break;
    }
    return bucketSizePeriod;
  }
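  // Illustrative example (not part of the original class): for a 5 MINUTES dataset with a function
  // frequency of 15 MINUTES, getBoundaryAlignedTimeForDataset rounds 12:53 down via
  // (53 / 15) * 15 = 45 to 12:45, and getBucketSizePeriodForDataset returns a 15-minute Period,
  // so consecutive detection buckets fall at 12:45, 13:00, 13:15, and so on.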
  /**
   * Creates new entries from the last entry up to the current time:
   * according to the time granularity of the dataset in case of HOURLY/DAILY data,
   * and according to the time granularity of the function frequency in case of MINUTE level data.
   * If it is an HOURLY dataset, run detection for every HOUR.
   * If it is a DAILY dataset, run detection for every DAY.
   * If it is an n MINUTE level dataset, run detection for every bucket, determined by the frequency
   * field in the anomaly function.
   *
   * @param currentDateTime
   * @param lastEntryForFunction
   * @param anomalyFunction
   * @param datasetConfig
   * @param dateTimeZone
   * @return a map from date string to milliseconds for each new entry
   */
  public static Map<String, Long> getNewEntries(DateTime currentDateTime, DetectionStatusDTO lastEntryForFunction,
      AnomalyFunctionDTO anomalyFunction, DatasetConfigDTO datasetConfig, DateTimeZone dateTimeZone) {
    Map<String, Long> newEntries = new LinkedHashMap<>();

    // get current hour/day, depending on granularity of dataset
    DateTimeFormatter dateTimeFormatterForDataset =
        DetectionJobSchedulerUtils.getDateTimeFormatterForDataset(datasetConfig, dateTimeZone);

    long alignedCurrentMillis =
        DetectionJobSchedulerUtils.getBoundaryAlignedTimeForDataset(datasetConfig, currentDateTime, anomalyFunction);
    DateTime alignedDateTime = new DateTime(alignedCurrentMillis, dateTimeZone);

    // if this is the first ever entry, create it with the current time
    if (lastEntryForFunction == null) {
      String currentDateString = dateTimeFormatterForDataset.print(alignedDateTime);
      newEntries.put(currentDateString, dateTimeFormatterForDataset.parseMillis(currentDateString));
    } else {
      // else create all entries from the last entry onwards, up to the current time
      DateTime lastDateTime = new DateTime(lastEntryForFunction.getDateToCheckInMS(), dateTimeZone);
      Period bucketSizePeriod =
          DetectionJobSchedulerUtils.getBucketSizePeriodForDataset(datasetConfig, anomalyFunction);
      while (lastDateTime.isBefore(alignedDateTime)) {
        lastDateTime = lastDateTime.plus(bucketSizePeriod);
        newEntries.put(dateTimeFormatterForDataset.print(lastDateTime), lastDateTime.getMillis());
      }
    }
    return newEntries;
  }

  /**
   * Creates the job name for an anomaly detection job.
   * @param anomalyFunction
   * @param startTimes
   * @param endTimes
   * @return the job name
   */
  public static String createJobName(AnomalyFunctionDTO anomalyFunction, List<Long> startTimes, List<Long> endTimes) {
    return String.format("%s-%s-%s-%s-%d", anomalyFunction.getId(), anomalyFunction.getFunctionName(),
        startTimes.get(0), endTimes.get(0), startTimes.size());
  }

  /**
   * Calculates the number of buckets that a time period can be divided into,
   * depending on the granularity of the dataset.
   * @param datasetConfig
   * @param startTime
   * @param endTime
   * @return the expected number of complete buckets between startTime and endTime
   */
  public static long getExpectedCompleteBuckets(DatasetConfigDTO datasetConfig, long startTime, long endTime) {
    TimeSpec timeSpec = ThirdEyeUtils.getTimeSpecFromDatasetConfig(datasetConfig);
    // Get the bucket size from DataCompletenessUtils because that determines the number of buckets to check
    long bucketSize = DataCompletenessUtils.getBucketSizeInMSForDataset(timeSpec);
    long numBuckets = (endTime - startTime) / bucketSize;
    return numBuckets;
  }
}
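/*
 * Usage sketch (illustrative only; the scheduler objects below, such as lastEntry, anomalyFunction,
 * and datasetConfig, are hypothetical inputs and not part of this class):
 *
 *   DateTimeZone zone = DateTimeZone.UTC;
 *   DateTime now = new DateTime(zone);
 *   // lastEntry is the most recent DetectionStatusDTO for this function, or null if none exists yet
 *   Map<String, Long> newEntries =
 *       DetectionJobSchedulerUtils.getNewEntries(now, lastEntry, anomalyFunction, datasetConfig, zone);
 *   for (Map.Entry<String, Long> entry : newEntries.entrySet()) {
 *     // persist a detection status entry for entry.getKey() / entry.getValue(),
 *     // then schedule detection for the corresponding bucket
 *   }
 */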