package edu.brown.costmodel; import java.util.ArrayList; import org.apache.log4j.Logger; import edu.brown.statistics.Histogram; import edu.brown.utils.MathUtil; /** * @author pavlo */ public abstract class SkewFactorUtil { private static final Logger LOG = Logger.getLogger(SkewFactorUtil.class); private static final int PERCISION = 6; private static final String DEBUG_F = "[%02d] current=%d, orig_r=%.05f, r=%.05f, log=%.05f, skew=%.05f\n"; /** * Calculate the skew factor of a histogram. This is an estimation of how * uniformly the partitions are accessed * * @param num_partitions * @param total_ctr * @param h * @return */ public static double calculateSkew(int num_partitions, long total_ctr, Histogram<Integer> h) { assert (num_partitions > 0) : "Number of partitions can't be zero"; assert (total_ctr > 0) : "Total cannot be zero [valueCount=" + h.getValueCount() + ", sampleCount=" + h.getSampleCount() + "]"; final boolean debug = LOG.isDebugEnabled(); if (debug) LOG.debug("Calcuating skew for histogram [num_partitions=" + num_partitions + ", total_ctr=" + total_ctr + "]"); // The skew factor is a calculation of how evenly the partitions are // accessed double skew = 0.0d; // The best skew is when all of the partitions have an equal number of // elements // As such this is always zero double best = 0.0d; double best_ratio = 1 / (double) num_partitions; // If a ratio for a partition is above the best_ratio, then we just can // take the log of its difference. // If it is below the ratio, then we need to flip it so that it is // greater than the best_ratio, but // we also need to scale the ratio it so that 0.0 has the same cost as // 1.0 // The worst skew is always zero. This would occur if all of the counted // elements // occured on a single partition, while all other partitions were zero // in the histogram // Given that, then the ratio for that single partition is 1.0, and // log(1.0) == 0 // double worst = MathUtil.roundToDecimals(Math.log(1.0 / best_ratio) + // Math.abs(Math.log(2.0) * (best_ratio * 2) * (num_partitions - 1)), // PERCISION); // double worst = MathUtil.roundToDecimals(Math.log(1.0 / best_ratio) * // num_partitions, PERCISION); double worst = Math.log(1.0 / best_ratio) * num_partitions; // Iterate through the partitions and calculate the skew summation double log = 0.0d; double ratio = 0.0d; double orig_ratio = 0.0d; ArrayList<Long> counts = new ArrayList<Long>(); StringBuilder sb = (debug ? new StringBuilder() : null); for (int i = 0; i < num_partitions; i++) { long current = h.get(i, 0); counts.add(current); orig_ratio = ratio = current / (double) total_ctr; // If the ratio is less than the best_ratio, then we flip it to be a // ratio above the // best_ratio. We have to normalize it so that if we are X% below // the best_ratio, then // the new value has to be X% above the best_ratio if (ratio < best_ratio) ratio = best_ratio + ((1.0 - (ratio / best_ratio)) * (1.0 - best_ratio)); assert (ratio >= best_ratio) : "Invalid ratio: " + ratio; assert (ratio <= 1.0) : "Invalid ratio: " + ratio; log = Math.abs(Math.log(ratio / best_ratio)); // log = Math.abs(Math.log(abs_ratio / best_ratio) * abs_ratio); skew += log; if (debug) sb.append(String.format(DEBUG_F, i, current, orig_ratio, ratio, log, skew)); } // FOR // skew = MathUtil.roundToDecimals(skew, PERCISION); if (debug) { // LOG.debug("values = " + counts); LOG.debug("Skew: " + skew); LOG.debug("Best: " + best); LOG.debug("BestRatio: " + best_ratio); LOG.debug("Worst: " + worst); // We use the min and max skew values to normalize the calculated // skew to be between [0, 1] if (skew > worst) { System.err.println("\n" + sb.toString()); System.err.println(h); System.err.println(skew + " <= " + worst); } } // assert(skew <= worst); assert (MathUtil.lessThanEquals(skew, worst, 0.0001)) : skew + " <= " + worst; assert (skew >= best) : skew + " >= " + best; final double final_value = (skew / worst); if (debug) LOG.debug("Final: " + final_value); return (final_value); // return (Math.abs(1.0d - (skew - worst) / (best - worst))); } }