package com.linkedin.thirdeye.completeness.checker;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import org.apache.commons.collections.CollectionUtils;
import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.ListMultimap;
import com.linkedin.pinot.client.ResultSet;
import com.linkedin.pinot.client.ResultSetGroup;
import com.linkedin.thirdeye.api.TimeSpec;
import com.linkedin.thirdeye.client.ThirdEyeCacheRegistry;
import com.linkedin.thirdeye.client.pinot.PinotQuery;
/**
* Utility methods for the data completeness checker
*/
public class DataCompletenessUtils {
private static final Logger LOG = LoggerFactory.getLogger(DataCompletenessUtils.class);
private static final ThirdEyeCacheRegistry CACHE_REGISTRY = ThirdEyeCacheRegistry.getInstance();
private static final String DAY_FORMAT = "yyyyMMdd";
private static final String HOUR_FORMAT = "yyyyMMddHH";
private static final String MINUTE_FORMAT = "yyyyMMddHHmm";
/**
* All MINUTE-granularity timestamps are rounded down to 30-minute boundaries by default
*/
private static final int MINUTES_LEVEL_ROUNDING = 30;
// HELPER methods for DataCompletenessTaskRunner
/**
* Round this time down to the earlier boundary, depending on the granularity of the dataset,
* e.g. 12:15pm on an HOURLY dataset should be treated as 12pm,
* and 12:50pm on any MINUTE-level dataset should be treated as 12:30pm.
* @param timeSpec time spec of the dataset
* @param dataCompletenessStartTime time to adjust, in millis
* @param zone time zone in which to interpret the time
* @return the adjusted time in millis
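* <p>A minimal sketch of the MINUTE-level rounding (the timestamp and zone here are hypothetical):
* <pre>{@code
* // 12:50 UTC on a MINUTE-level dataset rounds down to the 12:30 boundary
* DateTime time = new DateTime(2017, 1, 1, 12, 50, DateTimeZone.UTC);
* int roundedMinutes = (time.getMinuteOfHour() / 30) * 30; // 30
* DateTime adjusted = time.withTime(time.getHourOfDay(), roundedMinutes, 0, 0); // 12:30
* }</pre>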
*/
public static long getAdjustedTimeForDataset(TimeSpec timeSpec, long dataCompletenessStartTime, DateTimeZone zone) {
DateTime adjustedDateTime = new DateTime(dataCompletenessStartTime, zone);
TimeUnit unit = timeSpec.getDataGranularity().getUnit();
switch (unit) {
case DAYS:
adjustedDateTime = adjustedDateTime.withTimeAtStartOfDay();
break;
case MINUTES:
int roundedMinutes = (adjustedDateTime.getMinuteOfHour() / MINUTES_LEVEL_ROUNDING) * MINUTES_LEVEL_ROUNDING;
adjustedDateTime = adjustedDateTime.withTime(adjustedDateTime.getHourOfDay(), roundedMinutes, 0, 0);
break;
case HOURS:
default:
adjustedDateTime = adjustedDateTime.withTime(adjustedDateTime.getHourOfDay(), 0, 0, 0);
break;
}
return adjustedDateTime.getMillis();
}
/**
* Get the bucket size in millis, according to the data granularity of the dataset.
* Bucket sizes are 1 HOUR for hourly, 1 DAY for daily, and 30 MINUTES for minute-level data.
* @param timeSpec time spec of the dataset
* @return the bucket size in millis
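* <p>For instance, any minute-level dataset maps to the 30-minute rounding bucket:
* <pre>{@code
* long bucketMillis = TimeUnit.MILLISECONDS.convert(30, TimeUnit.MINUTES); // 1800000
* }</pre>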
*/
public static long getBucketSizeInMSForDataset(TimeSpec timeSpec) {
long bucketMillis = 0;
TimeUnit unit = timeSpec.getDataGranularity().getUnit();
switch (unit) {
case DAYS:
bucketMillis = TimeUnit.MILLISECONDS.convert(1, TimeUnit.DAYS);
break;
case MINUTES:
bucketMillis = TimeUnit.MILLISECONDS.convert(MINUTES_LEVEL_ROUNDING, TimeUnit.MINUTES);
break;
case HOURS:
default:
bucketMillis = TimeUnit.MILLISECONDS.convert(1, TimeUnit.HOURS);
break;
}
return bucketMillis;
}
/**
* Get the date time formatter according to the granularity of the dataset.
* This is used to store the date in the db in the correct SimpleDateFormat pattern.
* @param timeSpec time spec of the dataset
* @param zone time zone to apply to the formatter
* @return the date time formatter for the dataset's granularity
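* <p>A sketch of the resulting bucket names (the date and zone here are hypothetical):
* <pre>{@code
* // HOURS granularity uses "yyyyMMddHH", so 12:00 on 2017-01-01 UTC becomes "2017010112"
* DateTimeFormatter formatter = DateTimeFormat.forPattern("yyyyMMddHH").withZone(DateTimeZone.UTC);
* String bucketName = formatter.print(new DateTime(2017, 1, 1, 12, 0, DateTimeZone.UTC));
* }</pre>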
*/
public static DateTimeFormatter getDateTimeFormatterForDataset(TimeSpec timeSpec, DateTimeZone zone) {
String pattern = null;
TimeUnit unit = timeSpec.getDataGranularity().getUnit();
switch (unit) {
case DAYS:
pattern = DAY_FORMAT;
break;
case MINUTES:
pattern = MINUTE_FORMAT;
break;
case HOURS:
default:
pattern = HOUR_FORMAT;
break;
}
DateTimeFormatter dateTimeFormatter = DateTimeFormat.forPattern(pattern).withZone(zone);
return dateTimeFormatter;
}
/**
* Get the time values which correspond to the time column in the Pinot segment, for the given buckets in millis.
* Knowing the millis or SDF value alone is not sufficient for querying Pinot, as Pinot might store the time column in an nSinceEpoch format.
* @param timeSpec time spec of the dataset
* @param bucketNameToBucketValue map from bucket name (in SDF) to bucket start time in millis
* @return multimap from bucket name to the time values it covers, as stored in the time column
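* <p>A sketch of the MINUTES expansion (the granularity and bucket time here are hypothetical):
* <pre>{@code
* // for 5-minute granularity, the 30-minute bucket at 12:30 UTC expands to six 5MinutesSinceEpoch values
* long bucketMillis = new DateTime(2017, 1, 1, 12, 30, DateTimeZone.UTC).getMillis();
* long firstValue = TimeUnit.MINUTES.convert(bucketMillis, TimeUnit.MILLISECONDS) / 5;
* // time values queried: firstValue, firstValue + 1, ..., firstValue + 5
* }</pre>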
*/
public static ListMultimap<String, Long> getBucketNameToTimeValuesMap(TimeSpec timeSpec,
Map<String, Long> bucketNameToBucketValue) {
ListMultimap<String, Long> bucketNameToTimeValues = ArrayListMultimap.create();
String timeFormat = timeSpec.getFormat();
if (timeFormat.equals(TimeSpec.SINCE_EPOCH_FORMAT)) {
TimeUnit unit = timeSpec.getDataGranularity().getUnit();
int timeDuration = timeSpec.getDataGranularity().getSize();
for (Entry<String, Long> entry : bucketNameToBucketValue.entrySet()) {
String bucketName = entry.getKey();
Long bucketValue = entry.getValue();
long timeValue = 0;
switch (unit) {
case MINUTES:
for (int i = 0; i < MINUTES_LEVEL_ROUNDING / timeDuration; i++) {
timeValue = TimeUnit.MINUTES.convert(bucketValue, TimeUnit.MILLISECONDS) / timeDuration;
bucketNameToTimeValues.put(bucketName, timeValue);
bucketValue = bucketValue + TimeUnit.MILLISECONDS.convert(timeDuration, TimeUnit.MINUTES);
}
break;
case DAYS:
timeValue = TimeUnit.DAYS.convert(bucketValue, TimeUnit.MILLISECONDS);
bucketNameToTimeValues.put(bucketName, timeValue);
break;
case HOURS:
default:
timeValue = TimeUnit.HOURS.convert(bucketValue, TimeUnit.MILLISECONDS);
bucketNameToTimeValues.put(bucketName, timeValue);
break;
}
}
} else {
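// for SDF time columns, the bucket name itself (e.g. "2017010112") is the value stored in the segment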
for (String bucketName : bucketNameToBucketValue.keySet()) {
bucketNameToTimeValues.put(bucketName, Long.valueOf(bucketName));
}
}
return bucketNameToTimeValues;
}
/**
* Get count(*) for the given buckets of the dataset.
* @param dataset name of the dataset
* @param timeSpec time spec of the dataset
* @param bucketNameToBucketValueMS map from bucket name (in SDF) to bucket start time in millis
* @return map from bucket name to its count(*)
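* <p>A minimal usage sketch (the dataset name, bucket, and {@code timeSpec} here are hypothetical):
* <pre>{@code
* Map<String, Long> buckets = new HashMap<>();
* buckets.put("2017010112", new DateTime(2017, 1, 1, 12, 0, DateTimeZone.UTC).getMillis());
* Map<String, Long> counts = DataCompletenessUtils.getCountsForBucketsOfDataset("myDataset", timeSpec, buckets);
* }</pre>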
*/
public static Map<String, Long> getCountsForBucketsOfDataset(String dataset, TimeSpec timeSpec,
Map<String, Long> bucketNameToBucketValueMS) {
// get time values according to the dataset's timeSpec (epoch or SDF values in the proper granularity)
// dateToCheckInSDF -> time values as present in the segments
// This is a multimap because for nMinutesSinceEpoch, a bucket (30 minutes) can cover more than one time value
// e.g.: for 5-minute granularity data, the checker will round to 30 minutes,
// but count(*) should be taken from the 6 time values in that 30-minute window
ListMultimap<String, Long> bucketNameToTimeValues = getBucketNameToTimeValuesMap(timeSpec, bucketNameToBucketValueMS);
LOG.info("Bucket name to time values {}", bucketNameToTimeValues);
Map<String, Long> bucketNameToCountStarMap = getBucketNameToCountStarMap(dataset, timeSpec, bucketNameToTimeValues);
return bucketNameToCountStarMap;
}
private static Map<String, Long> getBucketNameToCountStarMap(String dataset, TimeSpec timeSpec,
ListMultimap<String, Long> bucketNameToTimeValues) {
Map<String, Long> bucketNameToCountStar = new HashMap<>();
// generate request
StringBuilder sb = new StringBuilder();
String delimiter = "";
for (Long timeValue : bucketNameToTimeValues.values()) {
sb.append(delimiter);
delimiter = " OR ";
sb.append(String.format("%s='%s'", timeSpec.getColumnName(), timeValue));
}
long top = bucketNameToTimeValues.values().size();
String pql = String.format("select count(*) from %s where %s group by %s top %s",
dataset, sb.toString(), timeSpec.getColumnName(), top);
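// illustrative PQL for two time values (the column name and values here are hypothetical):
// select count(*) from myDataset where timeColumn='412596' OR timeColumn='412597' group by timeColumn top 2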
Map<Long, Long> timeValueToCount = new HashMap<>();
try {
ResultSetGroup resultSetGroup = CACHE_REGISTRY.getResultSetGroupCache().get(new PinotQuery(pql, dataset));
if (resultSetGroup == null || resultSetGroup.getResultSetCount() <= 0) {
return bucketNameToCountStar;
}
ResultSet resultSet = resultSetGroup.getResultSet(0);
for (int i = 0; i < resultSet.getRowCount(); i++) {
Long timeValue = Long.valueOf(resultSet.getGroupKeyString(i, 0));
Long count = resultSet.getLong(i, 0);
timeValueToCount.put(timeValue, count);
}
} catch (ExecutionException e) {
LOG.error("Exception in getting count *. PQL:{}", pql, e);
}
// parse response to get counts
for (String bucketName : bucketNameToTimeValues.keySet()) {
List<Long> timeValues = bucketNameToTimeValues.get(bucketName);
Long sumOfCountForBucket = 0L;
for (Long timeValue : timeValues) {
long val = 0L;
if (timeValueToCount.containsKey(timeValue)) {
val = timeValueToCount.get(timeValue);
}
sumOfCountForBucket = sumOfCountForBucket + val;
}
bucketNameToCountStar.put(bucketName, sumOfCountForBucket);
}
return bucketNameToCountStar;
}
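/**
* Compute the percent completeness of the current count against the baseline counts,
* according to the algorithm named in the input (WO4W_AVERAGE: the average of the baseline counts).
* If there is no baseline to compare against but data is present, completeness is reported as 100.
* e.g. baselineCounts = [100, 200, 300, 400] (average 250) with currentCount = 225 gives 90.0
* @param input input holding the algorithm name, baseline counts, and current count
* @return the percent completeness, as a value out of 100
*/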
public static double getPercentCompleteness(PercentCompletenessFunctionInput input) {
DataCompletenessConstants.DataCompletenessAlgorithmName algorithm = input.getAlgorithm();
List<Long> baselineCounts = input.getBaselineCounts();
Long currentCount = input.getCurrentCount();
double percentCompleteness = 0;
double baselineTotalCount = 0;
if (CollectionUtils.isNotEmpty(baselineCounts)) {
switch (algorithm) {
case WO4W_AVERAGE:
default:
for (Long baseline : baselineCounts) {
baselineTotalCount = baselineTotalCount + baseline;
}
baselineTotalCount = baselineTotalCount / baselineCounts.size();
break;
}
}
if (baselineTotalCount != 0) {
percentCompleteness = currentCount * 100.0 / baselineTotalCount;
} else if (currentCount != 0) {
percentCompleteness = 100;
}
return percentCompleteness;
}
}