/* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.addthis.hydra.data.query.op; import com.addthis.bundle.core.Bundle; import com.addthis.bundle.core.BundleFactory; import com.addthis.bundle.core.BundleFormat; import com.addthis.bundle.util.AutoField; import com.addthis.bundle.util.IndexField; import com.addthis.bundle.value.ValueFactory; import com.addthis.hydra.data.query.AbstractQueryOp; import com.addthis.hydra.data.util.KeyPercentileDistribution; import com.yammer.metrics.stats.Snapshot; import io.netty.channel.ChannelProgressivePromise; /** * <p>This query operation <span class="hydra-summary">calculates the percentile distribution of a column</span>. * <p/> * <p>The syntax for the operation is distribution=[column number],[sample size]. The sample * size is optional and the default sample size is 1028. The result of this operation is a table * with two columns. Column 0 has percentile distributions and column 1 has the counts for * those percentile distributions.</p> * * @user-reference * @hydra-name distribution */ public class OpPercentileDistribution extends AbstractQueryOp { private final KeyPercentileDistribution histo; private final AutoField column; private final BundleFactory bundleFactory; /** * usage: column, sampleSize * <p/> * column defines the column source for the percentile value * sampleSize determines the size of the sample set to use when calculating percentiles */ public OpPercentileDistribution(BundleFactory bundleFactory, String args, ChannelProgressivePromise queryPromise) { super(queryPromise); this.bundleFactory = bundleFactory; int[] v = csvToInts(args); if (v.length < 1) { throw new RuntimeException("missing required column"); } column = new IndexField(v[0]); int sampleSize; if (v.length > 1) { sampleSize = v[1]; } else { sampleSize = 1028; } histo = new KeyPercentileDistribution(sampleSize).init(); } @Override public void send(Bundle bundle) { long ev = column.getLong(bundle).getAsLong(); histo.update(ev); } @Override public void sendComplete() { // prep bundle format Bundle bundle = bundleFactory.createBundle(); BundleFormat tableFormat = bundle.getFormat(); ensureMinimumFieldCount(tableFormat, 2); AutoField label = new IndexField(0); AutoField value = new IndexField(1); // output Snapshot snapshot = histo.getSnapshot(); writeLine(label, value, bundle, ".5", snapshot.getMedian()); writeLine(label, value, bundleFactory.createBundle(), ".75", snapshot.get75thPercentile()); writeLine(label, value, bundleFactory.createBundle(), ".95", snapshot.get95thPercentile()); writeLine(label, value, bundleFactory.createBundle(), ".98", snapshot.get98thPercentile()); writeLine(label, value, bundleFactory.createBundle(), ".99", snapshot.get99thPercentile()); writeLine(label, value, bundleFactory.createBundle(), ".999", snapshot.get999thPercentile()); getNext().sendComplete(); } private void writeLine(AutoField labelField, AutoField valueField, Bundle bundle, String label, double value) { labelField.setValue(bundle, ValueFactory.create(label)); valueField.setValue(bundle, ValueFactory.create(value)); getNext().send(bundle); } private static void ensureMinimumFieldCount(BundleFormat format, int targetCount) { int suffixNum = 0; while (format.getFieldCount() < targetCount) { format.getField("__op_percent_dist_anon_" + suffixNum); suffixNum += 1; } } }