/* (c) 2014 LinkedIn Corp. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 * this file except in compliance with the License. You may obtain a copy of the
 * License at http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied.
 */
package com.linkedin.cubert.plan.physical;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.map.ObjectMapper;
import org.codehaus.jackson.node.ObjectNode;

import com.linkedin.cubert.block.Block;
import com.linkedin.cubert.block.BlockProperties;
import com.linkedin.cubert.block.BlockSchema;
import com.linkedin.cubert.block.CommonContext;
import com.linkedin.cubert.block.ContextBlock;
import com.linkedin.cubert.operator.OperatorFactory;
import com.linkedin.cubert.operator.OperatorType;
import com.linkedin.cubert.operator.PostCondition;
import com.linkedin.cubert.operator.PreconditionException;
import com.linkedin.cubert.operator.PreconditionExceptionType;
import com.linkedin.cubert.operator.TupleOperator;
import com.linkedin.cubert.operator.aggregate.AggregationFunction;
import com.linkedin.cubert.operator.aggregate.AggregationFunctions;
import com.linkedin.cubert.operator.aggregate.AggregationType;
import com.linkedin.cubert.plan.physical.CubertReducer.ReduceContext;
import com.linkedin.cubert.utils.JsonUtils;
import com.linkedin.cubert.utils.TupleUtils;

/**
 * Combines data at the mapper during the shuffle stage.
 *
 * This object is instantiated if the job configuration has a "shuffle" property and,
 * further, if that shuffle property has an "aggregates" property.
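 *
 * For orientation, the shuffle JSON read from the job configuration is expected to
 * carry at least the fields consumed below ("name", "type", "pivotKeys", "aggregates"
 * and "schema"). The fragment that follows is only an illustrative sketch, not taken
 * from an actual plan; inside an aggregate, only the "type" field is read here, and
 * the other fields shown are hypothetical:
 *
 * <pre>
 * {
 *   "name": "shuffle",
 *   "type": "SHUFFLE",
 *   "pivotKeys": ["memberId"],
 *   "aggregates": [{ "type": "SUM", "input": "clicks", "output": "sumClicks" }],
 *   "schema": ...
 * }
 * </pre>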
 *
 * @author Maneesh Varshney
 */
public class CubertCombiner extends Reducer<Tuple, Tuple, Tuple, Tuple>
{
    @Override
    public void run(Context context) throws IOException,
            InterruptedException
    {
        Configuration conf = context.getConfiguration();
        ObjectMapper mapper = new ObjectMapper();
        JsonNode shuffleJson =
                mapper.readValue(conf.get(CubertStrings.JSON_SHUFFLE), JsonNode.class);

        // Rewrite the shuffle JSON as a GROUP BY specification: the pivot keys become
        // the grouping keys and the shuffle aggregates become the group-by aggregates.
        ObjectNode groupByJson = mapper.createObjectNode();
        groupByJson.put("name", shuffleJson.get("name"));
        groupByJson.put("type", shuffleJson.get("type"));
        groupByJson.put("groupBy", shuffleJson.get("pivotKeys"));
        groupByJson.put("aggregates", shuffleJson.get("aggregates"));

        String[] keyColumns = JsonUtils.asArray(shuffleJson, "pivotKeys");
        BlockSchema fullSchema = new BlockSchema(shuffleJson.get("schema"));
        BlockSchema valueSchema = fullSchema.getComplementSubset(keyColumns);

        // Wrap the map output as a block so it can be fed to the GROUP_BY operator.
        CommonContext commonContext = new ReduceContext(context);
        Block input = new ContextBlock(commonContext);
        input.configure(shuffleJson);

        try
        {
            TupleOperator operator =
                    OperatorFactory.getTupleOperator(OperatorType.GROUP_BY);
            Map<String, Block> inputMap = new HashMap<String, Block>();
            inputMap.put("groupbyBlock", input);
            BlockProperties props =
                    new BlockProperties(null, fullSchema, (BlockProperties) null);
            operator.setInput(inputMap, groupByJson, props);

            String[] valueColumns = valueSchema.getColumnNames();
            Tuple tuple;
            Tuple key = TupleFactory.getInstance().newTuple(keyColumns.length);
            Tuple value = TupleFactory.getInstance().newTuple(valueColumns.length);

            // Split each aggregated tuple into (key, value) and emit it for the reducer.
            while ((tuple = operator.next()) != null)
            {
                TupleUtils.extractTupleWithReuse(tuple, fullSchema, key, keyColumns);
                TupleUtils.extractTupleWithReuse(tuple, fullSchema, value, valueColumns);
                context.write(key, value);
            }
        }
        // Catch this exception here and don't let it propagate to Hadoop; if it does,
        // a bug in the Hadoop code hangs the job without killing it.
        catch (Exception e)
        {
            e.printStackTrace();
        }
    }

    public static void checkPostCondition(Map<String, PostCondition> preConditions,
                                          JsonNode json) throws PreconditionException
    {
        PostCondition condition = preConditions.values().iterator().next();
        BlockSchema inputSchema = condition.getSchema();
        String[] keyColumns = JsonUtils.asArray(json, "pivotKeys");

        // The output schema is the pivot keys followed by the output of each aggregate.
        BlockSchema outputSchema = inputSchema.getSubset(keyColumns);
        if (json.has("aggregates"))
        {
            for (JsonNode aggregateJson : json.path("aggregates"))
            {
                AggregationType aggType =
                        AggregationType.valueOf(JsonUtils.getText(aggregateJson, "type"));
                AggregationFunction aggregator =
                        AggregationFunctions.get(aggType, aggregateJson);
                BlockSchema aggSchema =
                        aggregator.outputSchema(inputSchema, aggregateJson);
                outputSchema = outputSchema.append(aggSchema);
            }
        }

        if (!inputSchema.equals(outputSchema))
            throw new PreconditionException(PreconditionExceptionType.INVALID_SCHEMA,
                                            "The input and output schema for SHUFFLE must be identical."
                                                    + "\n\tInput Schema: " + inputSchema
                                                    + "\n\tOutput Schema: " + outputSchema);
    }
}