/* (c) 2014 LinkedIn Corp. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use
* this file except in compliance with the License. You may obtain a copy of the
* License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied.
*/
package com.linkedin.cubert.operator;
import com.linkedin.cubert.block.Block;
import com.linkedin.cubert.block.BlockProperties;
import com.linkedin.cubert.block.BlockSchema;
import com.linkedin.cubert.block.DataType;
import com.linkedin.cubert.block.PivotedBlock;
import com.linkedin.cubert.memory.CompactHashTableBase;
import com.linkedin.cubert.memory.IntIterator;
import com.linkedin.cubert.memory.IntSet;
import com.linkedin.cubert.operator.cube.CountDistinctCubeAggregator;
import com.linkedin.cubert.operator.cube.CubeAggregator;
import com.linkedin.cubert.operator.cube.CubeDimensions;
import com.linkedin.cubert.operator.cube.DefaultCubeAggregator;
import com.linkedin.cubert.operator.cube.DefaultDupleCubeAggregator;
import com.linkedin.cubert.operator.cube.DimensionKey;
import com.linkedin.cubert.operator.cube.DupleCubeAggregator;
import com.linkedin.cubert.operator.cube.EasyCubeAggregator;
import com.linkedin.cubert.operator.cube.EasyCubeAggregatorBridge;
import com.linkedin.cubert.operator.cube.ValueAggregationType;
import com.linkedin.cubert.operator.cube.ValueAggregator;
import com.linkedin.cubert.operator.cube.ValueAggregatorFactory;
import com.linkedin.cubert.utils.ClassCache;
import com.linkedin.cubert.utils.CommonUtils;
import com.linkedin.cubert.utils.JsonUtils;
import com.linkedin.cubert.utils.Pair;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.codehaus.jackson.JsonNode;
/**
*
* @author Maneesh Varshney
*
*/
public class CubeOperator implements TupleOperator
{
private static final Log LOG = LogFactory.getLog(CubeOperator.class.getName());
// Default operator configurations
private static final int DEFAULT_HASH_TABLE_SIZE = 2000000;
// inputs
private boolean hasInnerDimensions = false;
private Block inputBlock;
// outputs
private Tuple outputTuple;
// aggregators
private final List<CubeAggregator> aggregators = new ArrayList<CubeAggregator>();
private final List<DupleCubeAggregator> dupleAggregators =
new ArrayList<DupleCubeAggregator>();
// hash table related
private int hashTableSize = DEFAULT_HASH_TABLE_SIZE;
private double flushThreshold = 0.95;
private CompactHashTableBase hashTable;
private Iterator<Pair<DimensionKey, Integer>> iterator;
private final IntSet indexSet = new IntSet();
// dimension key related
private CubeDimensions dimensions;
// runtime state management
private boolean inputAvailable = true;
private Counter flushCounter;
@Override
public void setInput(Map<String, Block> input, JsonNode json, BlockProperties props) throws IOException,
InterruptedException
{
// get the input block and the input schema
inputBlock = input.values().iterator().next();
BlockSchema inputSchema = inputBlock.getProperties().getSchema();
// get the output block schema
BlockSchema outputSchema = props.getSchema();
// read configurations from json
String[] dimensionNames = JsonUtils.asArray(json, "dimensions");
String[] innerDimensions = JsonUtils.asArray(json, "innerDimensions");
if (json.has("hashTableSize") && !json.get("hashTableSize").isNull())
hashTableSize = json.get("hashTableSize").getIntValue();
hasInnerDimensions = (innerDimensions != null);
if (hasInnerDimensions)
inputBlock = new PivotedBlock(inputBlock, innerDimensions);
// create aggregators
List<CubeAggInfo> aggs = new ArrayList<CubeAggInfo>();
List<DupleCubeAggInfo> dupleAggs = new ArrayList<DupleCubeAggInfo>();
try
{
createAggregators(json, inputSchema, hasInnerDimensions, aggs, dupleAggs);
}
catch (PreconditionException e)
{
// this will not happen, since this method is also called in the
// getPostCondition method, and any PreconditionException will be caught at
// compile time
throw new RuntimeException(e);
}
// initialize and allocate additive aggregates
for (CubeAggInfo info : aggs)
{
info.getFirst().setup(inputBlock, outputSchema, info.getSecond());
info.getFirst().allocate(hashTableSize);
aggregators.add(info.getFirst());
}
// initialize and allocate partitioned additive aggregates
for (DupleCubeAggInfo info : dupleAggs)
{
info.getFirst().setup(inputBlock, outputSchema, info.getSecond());
info.getFirst().allocate(hashTableSize);
dupleAggregators.add(info.getFirst());
}
// create the single copy of output tuple
outputTuple = TupleFactory.getInstance().newTuple(outputSchema.getNumColumns());
// initialize CubeDimensions
dimensions =
new CubeDimensions(inputSchema,
outputSchema,
dimensionNames,
json.get("groupingSets"));
// create compact hash table
hashTable =
new CompactHashTableBase(dimensions.getDimensionKeyLength(),
hashTableSize);
// set the flush threshold (if defined in conf)
flushThreshold =
PhaseContext.getConf().getFloat("cubert.cube.flush.threshold", (float) flushThreshold);
flushCounter = CubertCounter.CUBE_FLUSH_COUNTER.getCounter();
}
/**
* Process input tuples for cubing without inner dimensions. Note that
* DupleCubeAggregators cannot be used here (any attempt to use such aggregators would
* have be caught at the compile time).
*
* @return boolean flag to indicate if there is more input to be processed
* @throws IOException
* @throws InterruptedException
*/
private boolean processWithoutInnerDimensions() throws IOException,
InterruptedException
{
if (!inputAvailable)
return false;
Tuple tuple;
while ((tuple = inputBlock.next()) != null)
{
// only the additive aggregators can be handled
for (CubeAggregator agg : aggregators)
agg.processTuple(tuple);
DimensionKey[] ancestors = dimensions.ancestors(tuple);
for (DimensionKey ancestor : ancestors)
{
Pair<Integer, Boolean> idx = hashTable.lookupOrCreateIndex(ancestor);
for (CubeAggregator agg : aggregators)
agg.aggregate(idx.getFirst());
}
if (hashTable.size() >= hashTableSize * flushThreshold)
{
flushCounter.increment(1);
break;
}
}
if (tuple == null)
inputAvailable = false;
iterator = hashTable.getIterator();
return true;
}
/**
* Process input tuples for cubing WITH inner dimensions.
*
* @return boolean flag to indicate if there is more input to be processed
* @return
* @throws IOException
* @throws InterruptedException
*/
private boolean processWithInnerDimensions() throws IOException,
InterruptedException
{
if (!inputAvailable)
return false;
while (true)
{
Tuple tuple;
indexSet.clear();
int maxIndex = 0;
while ((tuple = inputBlock.next()) != null)
{
for (CubeAggregator agg : aggregators)
agg.processTuple(tuple);
for (CubeAggregator agg : dupleAggregators)
agg.processTuple(tuple);
DimensionKey[] ancestors = dimensions.ancestors(tuple);
for (DimensionKey ancestor : ancestors)
{
Pair<Integer, Boolean> idx = hashTable.lookupOrCreateIndex(ancestor);
for (CubeAggregator agg : aggregators)
agg.aggregate(idx.getFirst());
for (DupleCubeAggregator agg : dupleAggregators)
agg.innerAggregate(idx.getFirst());
Integer index = idx.getFirst();
maxIndex = Math.max(maxIndex, index.intValue());
indexSet.add(index);
}
}
// Ensure capacity
if (maxIndex + 1 > hashTableSize)
{
for (DupleCubeAggregator agg : dupleAggregators)
agg.allocate(maxIndex);
}
IntIterator it = indexSet.iterator();
while (it.hasNext())
{
int index = it.next();
for (DupleCubeAggregator agg : dupleAggregators)
agg.aggregate(index);
}
if (!((PivotedBlock) inputBlock).advancePivot())
{
inputAvailable = false;
break;
}
if (hashTable.size() >= hashTableSize * flushThreshold) {
flushCounter.increment(1);
break;
}
}
iterator = hashTable.getIterator();
return true;
}
private boolean process() throws IOException,
InterruptedException
{
hashTable.clear();
for (CubeAggregator agg: this.aggregators)
agg.clear();
for (DupleCubeAggregator agg: this.dupleAggregators)
agg.clear();
if (hasInnerDimensions)
return processWithInnerDimensions();
else
return processWithoutInnerDimensions();
}
@Override
public Tuple next() throws IOException,
InterruptedException
{
if (iterator == null)
{
if (!process())
return null;
}
if (iterator.hasNext())
{
Pair<DimensionKey, Integer> pair = iterator.next();
DimensionKey key = pair.getFirst();
int index = pair.getSecond();
dimensions.outputKey(key, outputTuple);
for (CubeAggregator agg : aggregators)
agg.outputTuple(outputTuple, index);
for (CubeAggregator agg : dupleAggregators)
agg.outputTuple(outputTuple, index);
return outputTuple;
}
else
{
iterator = null;
return next();
}
}
@Override
public PostCondition getPostCondition(Map<String, PostCondition> preConditions,
JsonNode json) throws PreconditionException
{
PostCondition condition = preConditions.values().iterator().next();
BlockSchema inputSchema = condition.getSchema();
String[] dimensions = JsonUtils.asArray(JsonUtils.get(json, "dimensions"));
String[] innerDimensions = JsonUtils.asArray(json, "innerDimensions");
// validate that dimensions columns exist and are INT, LONG or STRING
for (String dim : dimensions)
{
if (!inputSchema.hasIndex(dim))
throw new PreconditionException(PreconditionExceptionType.COLUMN_NOT_PRESENT,
dim);
DataType type = inputSchema.getType(inputSchema.getIndex(dim));
if (!type.isIntOrLong() && type != DataType.BOOLEAN && !type.equals(DataType.STRING))
throw new PreconditionException(PreconditionExceptionType.INVALID_DIMENSION_TYPE,
"Expecting type: BOOLEAN, INT, LONG or STRING. Found: "
+ type);
}
// validate inner dimensions (if specified)
if (innerDimensions != null)
{
// validate that innerDimensions exist
for (String dim : innerDimensions)
{
if (!inputSchema.hasIndex(dim))
throw new PreconditionException(PreconditionExceptionType.COLUMN_NOT_PRESENT,
dim);
}
// validate that block is partitioned on inner dimensions
String[] partitionKeys = condition.getPartitionKeys();
if (partitionKeys == null || partitionKeys.length == 0
|| !CommonUtils.isPrefix(innerDimensions, partitionKeys))
{
String msg =
String.format("Expected: %s. Found: %s",
Arrays.toString(innerDimensions),
Arrays.toString(partitionKeys));
throw new PreconditionException(PreconditionExceptionType.INVALID_PARTITION_KEYS,
msg);
}
// validate that block is sorted on inner dimensions
String[] sortKeys = condition.getSortKeys();
if (sortKeys == null || sortKeys.length == 0
|| !CommonUtils.isPrefix(sortKeys, innerDimensions))
{
String msg =
String.format("Expected: %s. Found: %s",
Arrays.toString(innerDimensions),
Arrays.toString(sortKeys));
throw new PreconditionException(PreconditionExceptionType.INVALID_SORT_KEYS,
msg);
}
}
// validate that dimensions in groupingSets are valid dimensions
JsonNode gsJson = json.get("groupingSets");
if (gsJson != null && !gsJson.isNull() && gsJson.size() > 0)
{
String[] gsInput = JsonUtils.asArray(gsJson);
Set<String> dimensionSet = new HashSet<String>();
for (int i = 0; i < dimensions.length; i++)
dimensionSet.add(dimensions[i]);
for (int i = 0; i < gsInput.length; i++)
{
String[] fields = gsInput[i].split(",");
for (String field : fields)
{
if (field.equals(""))
continue; // roll up everything TODO: check ROLLUP clause (?)
if (!dimensionSet.contains(field))
{
String msg =
String.format("Dimension \"%s\" in grouping set (%s) is not a valid dimension",
field,
gsInput[i]);
throw new PreconditionException(PreconditionExceptionType.INVALID_DIMENSION_TYPE,
msg);
}
}
}
}
// generate output schema
BlockSchema outputSchema = createOutputSchema(inputSchema, json);
// create post condition
return new PostCondition(outputSchema, condition.getPartitionKeys(), null);
}
private static final class CubeAggInfo extends Pair<CubeAggregator, JsonNode>
{
private static final long serialVersionUID = 3313689844388231187L;
public CubeAggInfo(CubeAggregator x, JsonNode y)
{
super(x, y);
}
}
private static final class DupleCubeAggInfo extends
Pair<DupleCubeAggregator, JsonNode>
{
private static final long serialVersionUID = -550007348499616264L;
public DupleCubeAggInfo(DupleCubeAggregator x, JsonNode y)
{
super(x, y);
}
}
private static BlockSchema createOutputSchema(BlockSchema inputSchema, JsonNode json) throws PreconditionException
{
List<CubeAggInfo> additiveAggs = new ArrayList<CubeAggInfo>();
List<DupleCubeAggInfo> partitionedAdditiveAggs =
new ArrayList<DupleCubeAggInfo>();
final String[] innerDimensions = JsonUtils.asArray(json, "innerDimensions");
createAggregators(json,
inputSchema,
innerDimensions != null,
additiveAggs,
partitionedAdditiveAggs);
Map<JsonNode, BlockSchema> aggMap = new HashMap<JsonNode, BlockSchema>();
for (CubeAggInfo info : additiveAggs)
{
JsonNode aggNode = info.getSecond();
aggMap.put(aggNode, info.getFirst().outputSchema(inputSchema, aggNode));
}
for (DupleCubeAggInfo info : partitionedAdditiveAggs)
{
JsonNode aggNode = info.getSecond();
aggMap.put(aggNode, info.getFirst().outputSchema(inputSchema, aggNode));
}
BlockSchema outputSchema =
inputSchema.getSubset(JsonUtils.asArray(JsonUtils.get(json, "dimensions")));
for (JsonNode aggregateJson : json.get("aggregates"))
outputSchema = outputSchema.append(aggMap.get(aggregateJson));
return outputSchema;
}
private static void createAggregators(JsonNode json,
BlockSchema inputSchema,
boolean hasInnerDimensions,
List<CubeAggInfo> aggs,
List<DupleCubeAggInfo> dupleAggs) throws PreconditionException
{
for (JsonNode aggregateJson : json.get("aggregates"))
{
JsonNode typeJson = aggregateJson.get("type");
// validate that type is defined in json
if (typeJson == null || typeJson.isNull())
throw new PreconditionException(PreconditionExceptionType.INVALID_CONFIG,
"<type> property not defined in Json: "
+ typeJson.toString());
// validate that type is a string or array
if (!typeJson.isTextual() && !typeJson.isArray())
throw new PreconditionException(PreconditionExceptionType.INVALID_CONFIG,
"<type> property not text or array: "
+ typeJson.toString());
// if array, validate that type has one or two items
if (typeJson.isArray() && !(typeJson.size() == 1 || typeJson.size() == 2))
throw new PreconditionException(PreconditionExceptionType.INVALID_CONFIG,
"<type> property as array can have either one or two items: "
+ typeJson.toString());
// validate that the input columns are present in input schema
String[] inputColNames = null;
DataType[] inputColTypes = null;
if (aggregateJson.has("input") && !aggregateJson.get("input").isNull())
{
inputColNames = JsonUtils.asArray(aggregateJson, "input");
inputColTypes = new DataType[inputColNames.length];
int idx = 0;
for (String colName : inputColNames)
{
if (!inputSchema.hasIndex(colName))
throw new PreconditionException(PreconditionExceptionType.COLUMN_NOT_PRESENT,
colName);
inputColTypes[idx++] =
inputSchema.getType(inputSchema.getIndex(colName));
}
}
// handle first the special case of array with two items
if (typeJson.isArray() && typeJson.size() == 2)
{
String[] aggregators = JsonUtils.asArray(typeJson);
ValueAggregationType outerType =
getCubeAggregationType(aggregators[0], true);
ValueAggregationType innerType =
getCubeAggregationType(aggregators[1], true);
// the "type" of inner aggregate is the type of input column
ValueAggregator innerAggregator =
ValueAggregatorFactory.get(innerType,
inputColTypes[0],
inputColNames[0]);
// the "type" of outer aggregate is the output type of inner aggregate
ValueAggregator outerAggregator =
ValueAggregatorFactory.get(outerType,
innerAggregator.outputType(),
inputColNames[0]);
DupleCubeAggregator cubeAggregator =
new DefaultDupleCubeAggregator(outerAggregator, innerAggregator);
if (!hasInnerDimensions)
errorInnerDimensionsNotSpecified(java.util.Arrays.toString(aggregators));
dupleAggs.add(new DupleCubeAggInfo(cubeAggregator, aggregateJson));
}
else
{
String type =
typeJson.isArray() ? typeJson.get(0).getTextValue()
: typeJson.getTextValue();
ValueAggregationType aggType = getCubeAggregationType(type, false);
// if this is builtin aggregator
if (aggType != null)
{
ValueAggregator aggregator =
ValueAggregatorFactory.get(aggType, inputColTypes == null
? null : inputColTypes[0], inputColNames == null
? null : inputColNames[0]);
CubeAggregator cubeggregator = new DefaultCubeAggregator(aggregator);
aggs.add(new CubeAggInfo(cubeggregator, aggregateJson));
}
else if (type.equals("COUNT_DISTINCT"))
{
if (!hasInnerDimensions)
errorInnerDimensionsNotSpecified(type);
DupleCubeAggregator cubeAggregator =
new CountDistinctCubeAggregator(inputColNames[0]);
dupleAggs.add(new DupleCubeAggInfo(cubeAggregator, aggregateJson));
}
// this is udaf
else
{
Object object = null;
try
{
Class<?> cls = ClassCache.forName(type);
object =
instantiateObject(cls,
aggregateJson.get("constructorArgs"));
}
catch (ClassNotFoundException e)
{
throw new PreconditionException(PreconditionExceptionType.CLASS_NOT_FOUND,
type);
}
catch (Exception e)
{
throw new PreconditionException(PreconditionExceptionType.MISC_ERROR,
e.getClass().getSimpleName()
+ " " + e.getMessage()
+ " for class: " + type);
}
if (object instanceof DupleCubeAggregator)
{
DupleCubeAggregator cubeAggregator = (DupleCubeAggregator) object;
if (!hasInnerDimensions)
errorInnerDimensionsNotSpecified(type);
dupleAggs.add(new DupleCubeAggInfo(cubeAggregator, aggregateJson));
}
else if (object instanceof CubeAggregator)
{
CubeAggregator cubeAggregator = (CubeAggregator) object;
aggs.add(new CubeAggInfo(cubeAggregator, aggregateJson));
}
else if (object instanceof EasyCubeAggregator)
{
EasyCubeAggregatorBridge cubeAggregator =
new EasyCubeAggregatorBridge((EasyCubeAggregator) object);
if (!hasInnerDimensions)
errorInnerDimensionsNotSpecified(type);
dupleAggs.add(new DupleCubeAggInfo(cubeAggregator, aggregateJson));
}
else
{
String msg =
String.format("%s should implement one of these interfaces: AdditiveCubeAggregate, PartitionedAdditiveAggregate, EasyCubeAggregate",
type);
throw new PreconditionException(PreconditionExceptionType.MISC_ERROR,
msg);
}
}
}
}
}
private static void errorInnerDimensionsNotSpecified(String aggName) throws PreconditionException
{
String msg =
String.format("INNER dimensions must be specified for the %s PartitionedAdditive aggregator",
aggName);
throw new PreconditionException(PreconditionExceptionType.INVALID_CONFIG, msg);
}
private static Object instantiateObject(Class<?> cls, JsonNode constructorArgs) throws InstantiationException,
IllegalAccessException,
IllegalArgumentException,
SecurityException,
InvocationTargetException,
NoSuchMethodException
{
if (constructorArgs == null || constructorArgs.isNull())
return cls.newInstance();
Object[] args = new Object[constructorArgs.size()];
Class<?>[] argClasses = new Class[args.length];
for (int i = 0; i < args.length; i++)
{
args[i] = JsonUtils.asObject(constructorArgs.get(i));
argClasses[i] = args[i].getClass();
}
return cls.getConstructor(argClasses).newInstance(args);
}
private static ValueAggregationType getCubeAggregationType(String name,
boolean errorOnMissing) throws PreconditionException
{
try
{
return ValueAggregationType.valueOf(name.toUpperCase());
}
catch (IllegalArgumentException e)
{
if (errorOnMissing)
{
String msg =
String.format("Aggregator [%s] not found. Valid aggregators: %s",
name,
java.util.Arrays.toString(ValueAggregationType.values()));
throw new PreconditionException(PreconditionExceptionType.INVALID_CONFIG,
msg);
}
}
return null;
}
}