/* (c) 2014 LinkedIn Corp. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use * this file except in compliance with the License. You may obtain a copy of the * License at http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR * CONDITIONS OF ANY KIND, either express or implied. */ package com.linkedin.cubert.operator.cube; import java.io.IOException; import org.apache.pig.backend.executionengine.ExecException; import org.apache.pig.data.Tuple; import org.apache.pig.impl.logicalLayer.FrontendException; import org.apache.pig.impl.logicalLayer.schema.Schema; import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema; import com.linkedin.cubert.block.BlockSchema; import com.linkedin.cubert.operator.AggregationBuffer; /** * Generic interface for all cube aggregators. * * This relies on an external storage type <code>AggregationBuffer</code> which stores the * partial and complete results of computation. Implementing classes would need to provide * and API to return a fresh object (on demand) of the storage type extended on * <code>AggregationBuffer</code> * * @author Mani Parkhe */ public interface EasyCubeAggregator { /* * setup aggregator. This method will be called only ONCE in the entire flow per * instance of this aggregator. * * The objective would be to save indexes of columns in <code>inputSchema</code> and * other meta data. */ public void setup(BlockSchema inputSchema) throws FrontendException; /* * This method will me called ONCE per <code>inputTuple</code>. * * The objective would be to save/update shared state. */ public void processTuple(Tuple inputTuple) throws ExecException; /* * Main aggregation operation. * * Will be called MULTIPLE times for each <code>inputTuple</code> for each qualifying * combination of dimensions based on # grouping sets. */ public void aggregate(AggregationBuffer aggregationBuffer); /* * Called ONCE on each <code>AggregationBuffer</code> object that is updated during * the processing of the current measure. */ public void endMeasure(AggregationBuffer aggregationBuffer); /* * Returns a fresh copy of storage class extended from <code>AggreagtionBuffer</code>. */ public AggregationBuffer getAggregationBuffer(); /* * Publish output schema for this aggregation. */ public FieldSchema outputSchema(Schema inputSchema) throws IOException; /* * Extract output data from aggregation buffer and output it as described as in * <code>FieldSchema</code> * * @parameter : <code>reUsedOutput</code> is an optimization should the user chose to * re-use the output object. First call to <code>output</code> method would send a * <code>null</code> for <code>reUsedOutput</code>. This method should check for this * condition and allocate an object and return it to caller as return argument. Future * calls to this method would re-send this previously returned object. */ public Object output(Object reUsedOutput, AggregationBuffer aggregationBuffer) throws ExecException; }