/* (c) 2014 LinkedIn Corp. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 * this file except in compliance with the License. You may obtain a copy of the
 * License at http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied.
 */

package com.linkedin.cubert.operator.cube;

import java.io.IOException;

import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.Tuple;
import org.codehaus.jackson.JsonNode;

import com.linkedin.cubert.block.Block;
import com.linkedin.cubert.block.BlockSchema;
import com.linkedin.cubert.operator.PreconditionException;

/**
 * Interface that specifies aggregators for the CUBE operator.
 * <p>
 * Specifically, this aggregator applies to the cases where the grain of the data is the
 * same as the grain of the CUBE. In other words, there are no "inner dimensions" that
 * need to be rolled up before computing the cube. See the {@link DupleCubeAggregator}
 * interface, which allows rolling up of inner dimensions.
 * <p>
 * This aggregator is responsible for aggregating values for multiple value cuboids in
 * the cube. The total number of value cuboids that this aggregator will be asked to
 * aggregate is specified initially with the {@link #allocate(int)} method.
 * <p>
 * The {@link #processTuple(Tuple)} method is called after each input tuple, and this
 * aggregator is expected to retrieve the relevant fields from the input tuple.
 * <p>
 * The {@link #aggregate(int)} method is called for each value cuboid. Each value cuboid
 * is uniquely indexed with an integer, which is provided to this method.
 * <p>
 * The {@link #outputTuple(Tuple, int)} method is responsible for writing the aggregated
 * value for a given value cuboid into the provided output tuple.
 * <p>
 * Finally, {@link #outputSchema(BlockSchema, JsonNode)} must provide the schema of the
 * output aggregate fields. This method is called at compile time, and is allowed to
 * throw {@link PreconditionException}, which will result in a compile time failure.
 *
 * @see DupleCubeAggregator
 *
 * @author Maneesh Varshney
 *
 */
public interface CubeAggregator
{
    /**
     * Configure the aggregation operator.
     *
     * @param block
     *            the block from which the input tuples are generated
     * @param outputSchema
     *            the schema of the output tuple
     * @param json
     *            a json node specifying the configuration properties
     * @throws IOException
     *             if the implementation encounters an I/O failure while configuring
     *             itself
     */
    void setup(Block block, BlockSchema outputSchema, JsonNode json) throws IOException;

    /**
     * Indicates the maximum number of value cuboids this aggregator must handle.
     *
     * @param size
     *            the maximum number of value cuboids to be handled by this aggregator
     */
    void allocate(int size);

    /**
     * Clear the internal contents. This is called when the hash table is flushed, and
     * built again with more data.
     */
    void clear();

    /**
     * Handles the current input tuple. This aggregator is expected to retrieve the
     * relevant fields from the input tuple.
     *
     * @param tuple
     *            the input tuple
     * @throws ExecException
     *             if accessing the fields of the input tuple fails
     */
    void processTuple(Tuple tuple) throws ExecException;

    /**
     * Aggregate the current value for the value cuboid at the specified index.
     * <p>
     * NOTE(review): the "current value" is presumably whatever was retrieved by the most
     * recent {@link #processTuple(Tuple)} call -- confirm against the implementations.
     *
     * @param index
     *            the index of the value cuboid for which to aggregate
     */
    void aggregate(int index);

    /**
     * Write the final aggregated value for the specified value cuboid into the output
     * tuple.
     *
     * @param outputTuple
     *            the tuple where the value is written
     * @param index
     *            the index of the value cuboid
     * @throws ExecException
     *             if writing the value into the output tuple fails
     */
    void outputTuple(Tuple outputTuple, int index) throws ExecException;

    /**
     * Returns the schema of the output aggregate fields. This method is called at
     * compile time.
     *
     * @param inputSchema
     *            schema of the input tuples
     * @param json
     *            the JSON configuration for this aggregator
     * @return the schema of the output aggregate fields
     * @throws PreconditionException
     *             if the implementation rejects the input schema or configuration; this
     *             results in a compile time failure
     */
    BlockSchema outputSchema(BlockSchema inputSchema, JsonNode json) throws PreconditionException;
}