/* (c) 2014 LinkedIn Corp. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use * this file except in compliance with the License. You may obtain a copy of the * License at http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR * CONDITIONS OF ANY KIND, either express or implied. */ package com.linkedin.cubert.operator; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Map; import org.apache.pig.data.Tuple; import org.apache.pig.data.TupleFactory; import org.codehaus.jackson.JsonNode; import org.codehaus.jackson.node.ArrayNode; import com.linkedin.cubert.block.Block; import com.linkedin.cubert.block.BlockProperties; import com.linkedin.cubert.block.BlockSchema; import com.linkedin.cubert.block.ColumnType; import com.linkedin.cubert.block.PivotedBlock; import com.linkedin.cubert.operator.aggregate.AggregationFunction; import com.linkedin.cubert.operator.aggregate.AggregationFunctions; import com.linkedin.cubert.operator.aggregate.AggregationType; import com.linkedin.cubert.utils.CommonUtils; import com.linkedin.cubert.utils.JsonUtils; /** * Group by operator implementation for the framework. The main assumption is that the * group by keys are also the pivot keys of the input cube. * * @author Krishna Puttaswamy * */ public class GroupByOperator implements TupleOperator { public static final String GROUP_BY_COLUMNNAMES = "groupBy"; private Block pivotedBlock; private Tuple output = null; private int[] groupByColumnIndex; private final List<AggregationFunction> aggregators = new ArrayList<AggregationFunction>(); private boolean moreData = true; private boolean isGroupedAll = false; @Override public void setInput(Map<String, Block> input, JsonNode root, BlockProperties props) throws IOException, InterruptedException { // get the input block Block dataBlock = input.values().iterator().next(); BlockSchema inputSchema = dataBlock.getProperties().getSchema(); BlockSchema outputSchema = props.getSchema(); output = TupleFactory.getInstance().newTuple(outputSchema.getNumColumns()); // create pivoted block if (((ArrayNode) root.get(GROUP_BY_COLUMNNAMES)).size() > 0) { isGroupedAll = false; String[] groupByColumns = JsonUtils.asArray(root, GROUP_BY_COLUMNNAMES); pivotedBlock = new PivotedBlock(dataBlock, groupByColumns); // store the index of groupby columns BlockSchema groupBySchema = inputSchema.getSubset(groupByColumns); groupByColumnIndex = new int[groupBySchema.getNumColumns()]; for (int i = 0; i < groupByColumnIndex.length; i++) { groupByColumnIndex[i] = inputSchema.getIndex(groupBySchema.getName(i)); } } else { isGroupedAll = true; pivotedBlock = dataBlock; } if (root.has("aggregates")) { for (JsonNode aggregateJson : root.path("aggregates")) { AggregationType aggType = AggregationType.valueOf(JsonUtils.getText(aggregateJson, "type")); AggregationFunction aggregator = AggregationFunctions.get(aggType, aggregateJson); aggregator.setup(pivotedBlock, outputSchema, aggregateJson); aggregators.add(aggregator); } } } @Override public Tuple next() throws IOException, InterruptedException { // for all the tuples in the current pivot, apply the aggregation operator and // then return the tuple with the aggregation columns added; if (!moreData) return null; // reset the aggregator state and the output tuple (the output tuple is reused) for (AggregationFunction aggregator : aggregators) { aggregator.resetState(); } // read the input rows and compute the aggregates Tuple tuple; boolean firstTuple = true; boolean noRecordSeen = true; while ((tuple = pivotedBlock.next()) != null) { if (firstTuple) { if (!isGroupedAll) { for (int i = 0; i < groupByColumnIndex.length; i++) { output.set(i, tuple.get(groupByColumnIndex[i])); } } firstTuple = false; noRecordSeen = false; } for (AggregationFunction aggregator : aggregators) { aggregator.aggregate(tuple); } } // copy the computed aggregates to the output tuple for (AggregationFunction aggregator : aggregators) { aggregator.output(output); } if (isGroupedAll) moreData = false; else moreData = ((PivotedBlock) pivotedBlock).advancePivot(); if (noRecordSeen) return null; else return output; } @Override public PostCondition getPostCondition(Map<String, PostCondition> preConditions, JsonNode json) throws PreconditionException { PostCondition condition = preConditions.values().iterator().next(); BlockSchema inputSchema = condition.getSchema(); String[] partitionKeys = condition.getPartitionKeys(); String[] sortKeys = condition.getSortKeys(); if (condition.getPivotKeys() != null) sortKeys = CommonUtils.concat(condition.getPivotKeys(), sortKeys); BlockSchema outputSchema; String[] groupByColumns = JsonUtils.asArray(json, GROUP_BY_COLUMNNAMES); // test that group by columns are present for (String groupByColumn : groupByColumns) { if (!inputSchema.hasIndex(groupByColumn)) throw new PreconditionException(PreconditionExceptionType.COLUMN_NOT_PRESENT, "Column [" + groupByColumn + "] not present."); } // test that block is sorted on group by columns if (groupByColumns.length > 0) { if (!CommonUtils.isPrefix(sortKeys, groupByColumns)) { System.out.println("Input SortKeys = " + Arrays.toString(sortKeys)); throw new PreconditionException(PreconditionExceptionType.INVALID_SORT_KEYS); } } // generate the output schema if (((ArrayNode) json.get(GROUP_BY_COLUMNNAMES)).size() > 0) { outputSchema = inputSchema.getSubset(groupByColumns); } else { outputSchema = new BlockSchema(new ColumnType[] {}); } String[] fullExpectedSortKeys = groupByColumns; boolean countDistinctAggPresent = false; if (json.has("aggregates")) { for (JsonNode aggregateJson : json.path("aggregates")) { // BlockSchema aggOutputSchema; AggregationType aggType = AggregationType.valueOf(JsonUtils.getText(aggregateJson, "type")); AggregationFunction aggregator = null; aggregator = AggregationFunctions.get(aggType, aggregateJson); if (aggregator == null) throw new PreconditionException(PreconditionExceptionType.INVALID_CONFIG, "Cannot instantiate aggregation operator for type " + aggType); BlockSchema aggOutputSchema = aggregator.outputSchema(inputSchema, aggregateJson); outputSchema = outputSchema.append(aggOutputSchema); // Check pre-condition for COUNT-DISTINCT String[] measureColumn = JsonUtils.asArray(aggregateJson.get("input")); if (aggType == AggregationType.COUNT_DISTINCT) { if (countDistinctAggPresent) throw new PreconditionException(PreconditionExceptionType.INVALID_GROUPBY); countDistinctAggPresent = true; fullExpectedSortKeys = CommonUtils.concat(groupByColumns, measureColumn); if (!CommonUtils.isPrefix(sortKeys, fullExpectedSortKeys)) { String errorMesg = "Expecting sortkeys = " + CommonUtils.join(fullExpectedSortKeys, ",") + " actual = " + CommonUtils.join(sortKeys, ","); throw new PreconditionException(PreconditionExceptionType.INVALID_SORT_KEYS, errorMesg); } } } } return new PostCondition(outputSchema, partitionKeys, groupByColumns); } }