/* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.addthis.hydra.data.query.op; import java.io.Closeable; import java.io.IOException; import java.util.HashMap; import java.util.Map; import java.util.UUID; import java.util.concurrent.TimeUnit; import com.addthis.basis.util.MemoryCounter; import com.addthis.basis.util.Parameter; import com.addthis.bundle.channel.DataChannelError; import com.addthis.bundle.core.Bundle; import com.addthis.bundle.core.BundleField; import com.addthis.bundle.core.list.ListBundle; import com.addthis.bundle.core.list.ListBundleFormat; import com.addthis.bundle.util.ValueUtil; import com.addthis.bundle.value.Numeric; import com.addthis.bundle.value.ValueObject; import com.addthis.hydra.data.query.AbstractQueryOp; import com.addthis.hydra.data.query.DiskBackedMap; import com.addthis.hydra.data.query.QueryOp; import com.addthis.hydra.data.query.op.merge.MergeConfig; import com.addthis.hydra.data.query.op.merge.MergedValue; import com.addthis.hydra.data.util.KeyTopper; import com.yammer.metrics.Metrics; import com.yammer.metrics.core.Meter; import io.netty.channel.ChannelProgressivePromise; /** * <p>This query operation <span class="hydra-summary">merges arbitrary rows</span>. * <p/> * <p>Gather collects all rows that match the criteria of the key columns. * It is an in-memory operation that spill over to disk when necessary. If the key * columns are already sorted then the {@link OpMerge merge} operation is * a much cheaper alternative.</p> * <p>The syntax for this operation is "gather=[column parameters] where * column parameters is a sequence of one or more of the following letters: * <ul> * <li>k - this column is a key column.</li> * <li>c - generates a cardinality estimation of this column.</li> * <li>i - this column is ignored and dropped from the output.</li> * <li>t - this column is a key topper.</li> * <li>a - generate average values for this column</li> * <li>d - generate iterated diff values for this column</li> * <li>m - generate min values for this column</li> * <li>M - generate max values for this column</li> * <li>s - generate sum values for this column</li> * <li>j - append all values for this column using "," as a separator</li> * <li>p - generate product values for this column</li> * </ul> * <p/> * <p>Key columns are specified using the "k" parameter. If two or more columns are * specified then the resulting keys will * be the <a href="http://en.wikipedia.org/wiki/Cartesian_product">cartesian product</a> of * the specified values. For non-key columns any rows that are merged apply the * column parameter operation to the values that are merged. A "u" character can be included * at the end of the column parameters to append a column that includes the number of merged * rows.</p> * <p/> * <p>Example:</p> * <pre> * 0 A 3 * 1 A 1 * 1 B 2 * 0 A 5 * * gather=iks * * A 9 * B 2 * </pre> * * @user-reference * @hydra-name gather */ public class OpGather extends AbstractQueryOp { public static Numeric num(ValueObject o) { Numeric num = ValueUtil.asNumberOrParseLong(o, 10); return num != null ? num : ZERO; } private Map<String, MergedRow> resultTable = new HashMap<>(); private final ListBundleFormat format = new ListBundleFormat(); private final MergedValue[] conf; private final long tipMem; private final long tipRow; private long totalMem; private final MergeConfig mergeConfig; private final KeyTopper topper; private final int topSize; private final int topColumn; private final String tmpDir; private boolean tippedToDisk = false; private boolean tipToDisk = Parameter.boolValue("opgather.tiptodisk", false); private static final Meter diskTips = Metrics.newMeter(OpGather.class, "diskTips", "diskTips", TimeUnit.SECONDS); public OpGather(String args, long tipMem, long tipRow, String tmpDir, ChannelProgressivePromise queryPromise) { super(queryPromise); this.tmpDir = tmpDir; this.tipMem = tipMem; this.tipRow = tipRow; totalMem = 0; mergeConfig = new MergeConfig(args); topColumn = mergeConfig.topColumn; topper = mergeConfig.topper; topSize = mergeConfig.numericArg; conf = mergeConfig.conf; } @Override public void send(Bundle row) throws DataChannelError { if (opPromise.isDone()) { return; } String key = mergeConfig.handleBindAndGetKey(row, format); MergedRow merge = resultTable.get(key); if (merge == null) { merge = new MergedRow(conf, new ListBundle(format)); resultTable.put(key, merge); if (!tippedToDisk) { totalMem += MemoryCounter.estimateSize(merge); } } if (!tippedToDisk) { totalMem -= MemoryCounter.estimateSize(merge); } merge.merge(row); if (tippedToDisk) { // Update the result on the disk, we need to put again resultTable.put(key, merge); } if (!tippedToDisk) { totalMem += MemoryCounter.estimateSize(merge); } if (topColumn >= 0) { BundleField topColumnTo = conf[topColumn].getTo(); if (topColumnTo != null) { Numeric num = num(merge.getValue(topColumnTo)); if (num == null) { return; } String drop = topper.update(key, num.asLong().getLong(), topSize); if (drop != null) { if (!tippedToDisk) { totalMem -= MemoryCounter.estimateSize(resultTable.get(drop)); } resultTable.remove(drop); } } } if (!tipToDisk) { // If we're not tipping to disk, and the tips are set, then we will issue errors if we pass them if (tipMem > 0 && totalMem > tipMem) { throw new DataChannelError("Memory usage of gathered objects exceeds allowed " + tipMem); } if (tipRow > 0 && resultTable.size() > tipRow) { throw new DataChannelError("Number of gathered rows exceeds allowed " + tipRow); } } else { // If we're tipping to disk, and the tips are non zero, then spill to disk once we pass them if (!tippedToDisk && ((tipMem > 0 && totalMem > tipMem) || (tipRow > 0 && resultTable.size() > tipRow))) { tippedToDisk = true; diskTips.mark(); // Use the smaller amount of memory for the JE cache environment long memToUse = totalMem; if (memToUse > tipMem) { memToUse = tipMem; } Map<String, MergedRow> diskMap = new DiskBackedMap<>(tmpDir + "/" + UUID.randomUUID(), new MergedRowFactory(conf, format), memToUse); diskMap.putAll(resultTable); resultTable = diskMap; } } } @Override public void sendComplete() { QueryOp next = getNext(); for (MergedRow mergedRow : resultTable.values()) { if (!opPromise.isDone()) { next.send(mergedRow.emit()); } else { break; } } next.sendComplete(); } @Override public void close() throws IOException { if (resultTable instanceof Closeable) { ((Closeable) resultTable).close(); } } }