/*
 * Copyright © 2015 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.cli.command;

import co.cask.cdap.api.annotation.Beta;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.cli.ArgumentName;
import co.cask.cdap.cli.CLIConfig;
import co.cask.cdap.cli.ElementType;
import co.cask.cdap.cli.english.Article;
import co.cask.cdap.cli.english.Fragment;
import co.cask.cdap.cli.util.AbstractCommand;
import co.cask.cdap.client.QueryClient;
import co.cask.cdap.client.StreamClient;
import co.cask.cdap.explore.client.ExploreExecutionResult;
import co.cask.cdap.proto.ColumnDesc;
import co.cask.cdap.proto.Id;
import co.cask.cdap.proto.QueryResult;
import co.cask.cdap.proto.StreamProperties;
import co.cask.common.cli.Arguments;
import com.google.common.base.Strings;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multiset;
import com.google.common.collect.Sets;
import com.google.common.util.concurrent.ListenableFuture;
import com.google.inject.Inject;

import java.io.PrintStream;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;

/**
 * A CLI command for getting statistics about stream events.
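 * <p>
 * Illustrative invocation (the stream name and argument values here are hypothetical):
 * <pre>
 *   get stream-stats purchases limit 1000 start -1h end max
 * </pre>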
 */
@Beta
public class GetStreamStatsCommand extends AbstractCommand {

  private static final int DEFAULT_LIMIT = 100;
  private static final int MAX_LIMIT = 100000;

  private final StreamClient streamClient;
  private final QueryClient queryClient;

  @Inject
  public GetStreamStatsCommand(StreamClient streamClient, QueryClient queryClient, CLIConfig cliConfig) {
    super(cliConfig);
    this.streamClient = streamClient;
    this.queryClient = queryClient;
  }

  @Override
  public void perform(Arguments arguments, PrintStream output) throws Exception {
    long currentTime = System.currentTimeMillis();

    Id.Stream streamId = Id.Stream.from(cliConfig.getCurrentNamespace(),
                                        arguments.get(ArgumentName.STREAM.toString()));
    // clamp limit to the range [1, MAX_LIMIT]
    int limit = Math.max(1, Math.min(MAX_LIMIT, arguments.getInt(ArgumentName.LIMIT.toString(), DEFAULT_LIMIT)));
    long startTime = getTimestamp(arguments.get(ArgumentName.START_TIME.toString(), "min"), currentTime);
    long endTime = getTimestamp(arguments.get(ArgumentName.END_TIME.toString(), "max"), currentTime);

    // hack to validate streamId
    StreamProperties config = streamClient.getConfig(streamId);
    if (config.getFormat().getName().equals("text")) {
      output.printf("No schema found for stream '%s'", streamId.getId());
      output.println();
      return;
    }

    // build processorMap: Hive column name -> set of StatsProcessors to apply to that column
    Map<String, Set<StatsProcessor>> processorMap = Maps.newHashMap();
    Schema streamSchema = config.getFormat().getSchema();
    for (Schema.Field field : streamSchema.getFields()) {
      Schema fieldSchema = field.getSchema();
      String hiveColumnName = cdapSchemaColumnName2HiveColumnName(streamId, field.getName());
      processorMap.put(hiveColumnName, getProcessorsForType(fieldSchema.getType(), fieldSchema.getUnionSchemas()));
    }

    // fetch a list of stream events and calculate various statistics about them
    String timestampCol = getTimestampHiveColumn(streamId);
    ListenableFuture<ExploreExecutionResult> resultsFuture = queryClient.execute(
      streamId.getNamespace(),
      "SELECT * FROM " + getHiveTableName(streamId)
        + " WHERE " + timestampCol + " BETWEEN " + startTime + " AND " + endTime
        + " LIMIT " + limit);
    ExploreExecutionResult results = resultsFuture.get(1, TimeUnit.MINUTES);
    List<ColumnDesc> schema = results.getResultSchema();

    // apply the StatsProcessors to every element in every row
    int rows = 0;
    while (results.hasNext()) {
      rows++;
      QueryResult row = results.next();
      for (int i = 0; i < row.getColumns().size(); i++) {
        Object column = row.getColumns().get(i);
        ColumnDesc columnDesc = schema.get(i);
        String columnName = columnDesc.getName();
        if (isUserHiveColumn(streamId, columnName)) {
          Set<StatsProcessor> processors = processorMap.get(columnName);
          if (processors != null) {
            for (StatsProcessor processor : processors) {
              processor.process(column);
            }
          }
        }
      }
    }

    // print the report
    for (ColumnDesc columnDesc : schema) {
      if (isUserHiveColumn(streamId, columnDesc.getName())) {
        String truncatedColumnName = getTruncatedColumnName(streamId, columnDesc.getName());
        output.printf("column: %s, type: %s", truncatedColumnName, columnDesc.getType());
        output.println();
        Set<StatsProcessor> processors = processorMap.get(columnDesc.getName());
        if (processors != null && !processors.isEmpty()) {
          for (StatsProcessor processor : processors) {
            processor.printReport(output);
          }
          output.println();
        } else {
          output.println("No statistics available");
          output.println();
        }
      }
    }

    output.printf("Analyzed %d Stream events in the time range [%d, %d]...", rows, startTime, endTime);
    output.println();
    output.println();
  }
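
  // The helpers below map CDAP schema names to the Hive names that Explore generates for a
  // stream. For a hypothetical stream named "purchases" queried with the default arguments,
  // the statement built above would look like (timestamp value made up):
  //   SELECT * FROM stream_purchases
  //   WHERE stream_purchases.ts BETWEEN 0 AND 1420000000000 LIMIT 100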

  private String getTruncatedColumnName(Id.Stream streamId, String hiveColumnName) {
    String hiveTableName = getHiveTableName(streamId);
    String hiveTablePrefix = hiveTableName + ".";
    if (hiveColumnName.startsWith(hiveTablePrefix)) {
      return hiveColumnName.substring(hiveTablePrefix.length());
    }
    return hiveColumnName;
  }

  private String getTimestampHiveColumn(Id.Stream streamId) {
    return cdapSchemaColumnName2HiveColumnName(streamId, "ts");
  }

  private boolean isUserHiveColumn(Id.Stream streamId, String hiveColumnName) {
    // TODO: hardcoded
    return !cdapSchemaColumnName2HiveColumnName(streamId, "ts").equals(hiveColumnName)
      && !cdapSchemaColumnName2HiveColumnName(streamId, "headers").equals(hiveColumnName);
  }

  private String getHiveTableName(Id.Stream streamId) {
    return String.format("stream_%s", streamId.getId());
  }

  private String cdapSchemaColumnName2HiveColumnName(Id.Stream streamId, String schemaColumnName) {
    return (getHiveTableName(streamId) + "." + schemaColumnName).toLowerCase();
  }

  private Set<StatsProcessor> getProcessorsForType(Schema.Type type, List<Schema> unionSchemas) {
    ImmutableSet.Builder<StatsProcessor> result = ImmutableSet.builder();

    boolean isBoolean = isTypeOrInUnion(Schema.Type.BOOLEAN, type, unionSchemas);
    boolean isInt = isTypeOrInUnion(Schema.Type.INT, type, unionSchemas);
    boolean isLong = isTypeOrInUnion(Schema.Type.LONG, type, unionSchemas);
    boolean isFloat = isTypeOrInUnion(Schema.Type.FLOAT, type, unionSchemas);
    boolean isDouble = isTypeOrInUnion(Schema.Type.DOUBLE, type, unionSchemas);
    boolean isBytes = isTypeOrInUnion(Schema.Type.BYTES, type, unionSchemas);
    boolean isString = isTypeOrInUnion(Schema.Type.STRING, type, unionSchemas);

    if (isBoolean || isInt || isLong || isString || isFloat || isDouble || isBytes) {
      result.add(new CountUniqueProcessor());
    }

    if (isInt || isLong || isFloat || isDouble) {
      result.add(new HistogramProcessor(cliConfig));
    }

    return result.build();
  }

  private boolean isTypeOrInUnion(Schema.Type desiredType, Schema.Type type, List<Schema> unionSchemas) {
    if (desiredType.equals(type)) {
      return true;
    }
    for (Schema unionSchema : unionSchemas) {
      if (desiredType == unionSchema.getType()) {
        return true;
      }
    }
    return false;
  }

  @Override
  public String getPattern() {
    return String.format("get stream-stats <%s> [limit <%s>] [start <%s>] [end <%s>]",
                         ArgumentName.STREAM, ArgumentName.LIMIT, ArgumentName.START_TIME, ArgumentName.END_TIME);
  }

  @Override
  public String getDescription() {
    return String.format("Gets statistics for %s. The '<%s>' limits how many stream events to analyze; default is " +
                         "%s. The time format for '<%s>' and '<%s>' can be a timestamp in " +
                         "milliseconds or a relative time in the form of '[+|-][0-9][d|h|m|s]'. " +
                         "'<%s>' is relative to current time; '<%s>' is relative to '<%s>'. " +
                         "Special constants 'min' and 'max' can be used to represent '0' and " +
                         "'max timestamp' respectively.",
                         Fragment.of(Article.A, ElementType.STREAM.getName()), ArgumentName.LIMIT, DEFAULT_LIMIT,
                         ArgumentName.START_TIME, ArgumentName.END_TIME, ArgumentName.START_TIME,
                         ArgumentName.END_TIME, ArgumentName.START_TIME);
  }

  /**
   * Processes elements within a Hive column and prints out a report about the elements visited.
   */
  private interface StatsProcessor {
    void process(Object element);
    void printReport(PrintStream printStream);
  }
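
  // Illustrative fragment of the report the processors below print for a numeric
  // column (the column name and all values are made up):
  //   column: price, type: double
  //   Unique elements: 42
  //   Histogram:
  //    [0, 99]: 17    |++++++++++
  //    [100, 199]: 5  |++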

  /**
   * Reports the number of unique elements found.
   */
  private static final class CountUniqueProcessor implements StatsProcessor {

    private final Set<Object> elements = Sets.newHashSet();

    @Override
    public void process(Object element) {
      if (element != null) {
        elements.add(element);
      }
    }

    @Override
    public void printReport(PrintStream printStream) {
      printStream.print("Unique elements: " + elements.size());
      printStream.println();
    }
  }

  /**
   * Reports a histogram of elements found.
   */
  private static final class HistogramProcessor implements StatsProcessor {

    private static final int MIN_BAR_WIDTH = 5;
    private static final int BUCKET_SIZE = 100;

    // 0 -> [0, 99], 1 -> [100, 199], etc. (each bucket covers BUCKET_SIZE values)
    private final Multiset<Integer> buckets = HashMultiset.create();
    private final CLIConfig cliConfig;

    public HistogramProcessor(CLIConfig cliConfig) {
      this.cliConfig = cliConfig;
    }

    @Override
    public void process(Object element) {
      // instanceof is false for null, so no separate null check is needed
      if (element instanceof Number) {
        Number number = (Number) element;
        int bucket = number.intValue() / BUCKET_SIZE;
        buckets.add(bucket);
      }
    }

    @Override
    public void printReport(PrintStream printStream) {
      if (!buckets.isEmpty()) {
        printStream.println("Histogram:");
        List<Integer> sortedBuckets = Lists.newArrayList(buckets.elementSet());
        Collections.sort(sortedBuckets);

        int maxCount = getBiggestBucket().getCount();
        int longestPrefix = getLongestBucketPrefix();
        // max length of the bar
        int maxBarLength = Math.max(MIN_BAR_WIDTH, cliConfig.getLineWidth() - longestPrefix);

        for (Integer bucketIndex : sortedBuckets) {
          Bucket bucket = new Bucket(bucketIndex, buckets.count(bucketIndex));
          // print padded prefix: e.g. " [100, 199]: 123 "
          printStream.print(padRight(bucket.getPrefix(), longestPrefix));
          // print the bar: e.g. |++++++++
          // TODO: determine barLength differently to show difference between 0 and low counts more clearly
          int barLength = (int) ((bucket.getCount() * 1.0 / maxCount) * maxBarLength);
          if (barLength == 0) {
            printStream.print("|");
          } else {
            printStream.print("|" + Strings.repeat("+", barLength - 1));
          }
          printStream.println();
        }
      }
    }

    private Bucket getBiggestBucket() {
      Bucket biggestBucket = null;
      for (Integer bucketIndex : buckets.elementSet()) {
        Bucket bucket = new Bucket(bucketIndex, buckets.count(bucketIndex));
        if (biggestBucket == null || bucket.getCount() > biggestBucket.getCount()) {
          biggestBucket = bucket;
        }
      }
      return biggestBucket;
    }

    private String padRight(String string, int padding) {
      return String.format("%1$-" + padding + "s", string);
    }

    private int getLongestBucketPrefix() {
      Set<Integer> bucketIndices = buckets.elementSet();
      int longestBucket = Collections.max(bucketIndices, new Comparator<Integer>() {
        @Override
        public int compare(Integer o1, Integer o2) {
          // approximate prefix length: digits of both range endpoints plus digits of the count
          return (Long.toString(o1 * BUCKET_SIZE).length() * 2 + Long.toString(buckets.count(o1)).length())
            - (Long.toString(o2 * BUCKET_SIZE).length() * 2 + Long.toString(buckets.count(o2)).length());
        }
      });
      Bucket bucket = new Bucket(longestBucket, buckets.count(longestBucket));
      String longestBucketPrefix = bucket.getPrefix();
      return longestBucketPrefix.length();
    }
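
    // Worked example of the bucket math below: with BUCKET_SIZE = 100, a value of 250
    // lands in bucket index 250 / 100 = 2, which covers the range [200, 299].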

    /**
     * A single histogram bucket: its index and the number of elements that fell into it.
     */
    private static final class Bucket {

      /**
       * Index into {@link GetStreamStatsCommand.HistogramProcessor#buckets}.
       */
      private final int index;
      private final int count;

      private Bucket(int index, int count) {
        this.index = index;
        this.count = count;
      }

      public int getIndex() {
        return index;
      }

      public int getCount() {
        return count;
      }

      public int getStartInclusive() {
        return index * BUCKET_SIZE;
      }

      public int getEndInclusive() {
        return getStartInclusive() + (BUCKET_SIZE - 1);
      }

      public String getPrefix() {
        return String.format(" [%d, %d]: %d ", getStartInclusive(), getEndInclusive(), count);
      }
    }
  }
}