package com.thinkbiganalytics.spark.dataprofiler.topn;

/*-
 * #%L
 * thinkbig-spark-job-profiler-app
 * %%
 * Copyright (C) 2017 ThinkBig Analytics
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import java.io.Serializable;
import java.util.Iterator;
import java.util.TreeSet;

/**
 * Bounded container that keeps at most N {@link TopNDataItem} entries for a column.<br>
 * Entries are held in a {@link TreeSet}, so their order is whatever ordering {@link TopNDataItem} defines
 * (presumably by count, lowest first - confirm against {@code TopNDataItem}).
 */
@SuppressWarnings("serial")
public class TopNDataList implements Serializable {

    /**
     * Delimiter to use when storing top-N values in result table<br>
     * This delimiter is output between fields of a single top-N entry
     */
    public static final String TOP_N_VALUES_INTERNAL_DELIMITER = "^A";

    /**
     * Delimiter to use when storing top-N values in result table<br>
     * This delimiter is output after each top-N entry
     */
    public static final String TOP_N_VALUES_RECORD_DELIMITER = "^B";

    private final TreeSet<TopNDataItem> topNDataItemsForColumn;
    private final int maxSize;
    // Count of the first (lowest-ordered) element currently held; Long.MAX_VALUE until something is added.
    private Long lowestCountSoFar = Long.MAX_VALUE;

    /**
     * Constructor to set the number of items in top N list
     *
     * @param maxSize N in Top N; any value that is not positive is replaced by a default of 3
     */
    public TopNDataList(int maxSize) {
        this.maxSize = (maxSize > 0) ? maxSize : 3;
        topNDataItemsForColumn = new TreeSet<>();
    }

    /**
     * Add an item for inclusion in top-N list <br>
     * While the list holds fewer than N items, the item is always offered to the list.<br>
     * Once the list is full, the item replaces the current first (lowest) entry only when its count is
     * strictly greater than the lowest count held; an item whose count merely equals the lowest count is
     * dropped, so the entry that was seen first is kept.
     *
     * @param newValue value
     * @param newCount count/frequency (must not be null once the list is full, since it is unboxed for comparison)
     */
    public void add(Object newValue, Long newCount) {
        boolean full = topNDataItemsForColumn.size() >= maxSize;

        if (!full) {
            addAndUpdateLowestCount(newValue, newCount);
            return;
        }

        if (newCount > lowestCountSoFar) {
            // Evict the current lowest entry to make room for the larger one.
            topNDataItemsForColumn.pollFirst();
            addAndUpdateLowestCount(newValue, newCount);
        }
    }

    /**
     * Helper method <br>
     * Add a new item in topN structure <br>
     * Update the lowest count in topN structure
     *
     * @param newValue value
     * @param newCount count/frequency
     */
    private void addAndUpdateLowestCount(Object newValue, Long newCount) {
        topNDataItemsForColumn.add(new TopNDataItem(newValue, newCount));
        // Re-read from the set rather than comparing locally, so the cached value always mirrors first().
        lowestCountSoFar = topNDataItemsForColumn.first().getCount();
    }

    /**
     * Print the top-N items as a string, walking the set in descending order (highest count first, lowest
     * count last, assuming {@link TopNDataItem} orders by count).<br>
     * Each entry is written as a 1-based rank, the value and the count, separated by
     * {@link #TOP_N_VALUES_INTERNAL_DELIMITER}, and every entry (including the last) is followed by
     * {@link #TOP_N_VALUES_RECORD_DELIMITER}.
     *
     * @return String of top-N items; empty string when no items have been added
     */
    public String printTopNItems() {
        int rank = 1;
        StringBuilder sb = new StringBuilder();

        // Typed iterator: avoids the raw type and the unchecked downcast on every element.
        Iterator<TopNDataItem> it = topNDataItemsForColumn.descendingIterator();
        while (it.hasNext()) {
            TopNDataItem item = it.next();
            sb.append(rank++).append(TOP_N_VALUES_INTERNAL_DELIMITER)
                .append(item.getValue())
                .append(TOP_N_VALUES_INTERNAL_DELIMITER)
                .append(item.getCount())
                .append(TOP_N_VALUES_RECORD_DELIMITER);
        }

        return sb.toString();
    }

    /**
     * Print all the items in the same format as {@link #printTopNItems()}.
     *
     * @return String of top-N items, delimited by {@link #TOP_N_VALUES_INTERNAL_DELIMITER} and
     * {@link #TOP_N_VALUES_RECORD_DELIMITER}
     */
    @Override
    public String toString() {
        return printTopNItems();
    }

    /**
     * Get the top-N items as an ordered set (ascending order of the set, i.e. lowest count to highest count
     * assuming {@link TopNDataItem} orders by count).<br>
     * The returned set is the live internal set, not a copy.
     *
     * @return Set with top-N items in ascending order
     */
    public TreeSet<TopNDataItem> getTopNDataItemsForColumn() {
        return this.topNDataItemsForColumn;
    }
}