/* (c) 2014 LinkedIn Corp. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use
* this file except in compliance with the License. You may obtain a copy of the
* License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied.
*/
package com.linkedin.cubert.analyzer.physical;
import com.linkedin.cubert.block.BlockSchema;
import com.linkedin.cubert.block.ColumnType;
import com.linkedin.cubert.block.DataType;
import com.linkedin.cubert.functions.builtin.FunctionFactory;
import com.linkedin.cubert.io.rubix.RubixConstants;
import com.linkedin.cubert.operator.BlockOperator;
import com.linkedin.cubert.operator.OperatorFactory;
import com.linkedin.cubert.operator.OperatorType;
import com.linkedin.cubert.operator.PivotBlockOperator;
import com.linkedin.cubert.operator.PostCondition;
import com.linkedin.cubert.operator.PreconditionException;
import com.linkedin.cubert.operator.PreconditionExceptionType;
import com.linkedin.cubert.operator.TupleOperator;
import com.linkedin.cubert.plan.physical.CubertCombiner;
import com.linkedin.cubert.utils.CommonUtils;
import com.linkedin.cubert.utils.JsonUtils;
import com.linkedin.cubert.utils.SchemaUtils;
import com.linkedin.cubert.utils.print;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.node.ObjectNode;
import static com.linkedin.cubert.utils.JsonUtils.asArray;
import static com.linkedin.cubert.utils.JsonUtils.getText;
/**
* Semantically analyzes the Cubert physical plan.
*
* @author Maneesh Varshney
*
*/
public class SemanticAnalyzer extends PhysicalPlanVisitor implements PlanRewriter
{
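/**
 * A node in the operator graph built during analysis. Each Node wraps an
 * operator's JSON definition and its computed PostCondition (schema, partition
 * keys and sort keys), and is linked to its child and parent operators as well
 * as to the next node in the per-job linked list of visited operators.
 */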
public static final class Node
{
JsonNode json;
PostCondition condition;
Node child;
ArrayList<Node> parents = new ArrayList<Node>();
Node next;
public Node(JsonNode json, PostCondition condition)
{
this.json = json;
this.condition = condition;
}
public void setChild(Node child)
{
this.child = child;
}
public void addParent(Node parent)
{
this.parents.add(parent);
}
@Override
public String toString()
{
String str = json.toString();
if (child != null)
{
str += "\n " + child.toString();
}
return str;
}
public JsonNode getOperatorJson()
{
return this.json;
}
public PostCondition getPostCondition()
{
return this.condition;
}
public Node getNext()
{
return next;
}
}
private Node linkedListHead;
private final Map<String, PostCondition> datasetConditions =
new HashMap<String, PostCondition>();
private Node lastShuffleNode;
private boolean hasErrors = false;
private String[] blockIndexJoinKeys;
private boolean revisit = false;
private final Map<String, Node> operatorMap = new HashMap<String, Node>();
private boolean debugMode = false;
// List of the final operator node from each mapper.
// The number of entries in this list equals the number of multi-mappers.
List<Node> mapOutputNodes = new ArrayList<Node>();
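/**
 * Returns the head of the linked list of operator nodes built for the most
 * recently analyzed operator chain.
 */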
public Node getNodeInformation()
{
return linkedListHead;
}
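/**
 * Walks the physical plan with this visitor, annotating each operator with its
 * schema and validating pre- and post-conditions. Returns the annotated plan,
 * or null if any semantic errors were found.
 */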
@Override
public JsonNode rewrite(JsonNode plan,
Set<String> namesUsed,
boolean debugMode,
boolean revisit) throws IOException
{
this.revisit = revisit;
this.debugMode = debugMode;
new PhysicalPlanWalker(plan, this).walk();
return hasErrors ? null : plan;
}
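/**
 * Records the PostCondition (schema and, if present, partition and sort keys)
 * of each program-level input dataset in datasetConditions.
 */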
@Override
public void enterProgram(JsonNode json)
{
if (json.has("input") && !json.get("input").isNull())
{
JsonNode programInput = json.get("input");
Iterator<String> it = programInput.getFieldNames();
while (it.hasNext())
{
String input = it.next();
JsonNode inputJson = programInput.get(input);
BlockSchema schema = new BlockSchema(inputJson.get("schema"));
PostCondition condition = null;
if (inputJson.has("partitionKeys") && inputJson.has("sortKeys"))
condition =
new PostCondition(schema,
JsonUtils.asArray(inputJson.get("partitionKeys")),
JsonUtils.asArray(inputJson.get("sortKeys")));
else
condition = new PostCondition(schema, null, null);
datasetConditions.put(input, condition);
}
}
}
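/**
 * Validates the reducer configuration of a job (a shuffle and a reduce phase
 * must be present if and only if reducers are requested; GENERATE_DICTIONARY
 * jobs are exempt) and resets the per-job analysis state.
 */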
@Override
public void enterJob(JsonNode json)
{
print.f("Analyzing job [%s]...", getText(json, "name"));
if (!json.has("jobType")
|| !getText(json, "jobType").equals("GENERATE_DICTIONARY"))
{
int numReducers = json.get("reducers").getIntValue();
if (numReducers < 0)
{
error(null,
"Invalid negative number of reducers in job "
+ getText(json, "name"));
}
else if (numReducers == 0)
{
if (json.has("shuffle") && !json.get("shuffle").isNull())
error(null, "Job [" + getText(json, "name")
+ "] is configured with 0 reducers, but has shuffle operator");
if (json.has("reduce") && !json.get("reduce").isNull())
error(null, "Job [" + getText(json, "name")
+ "] is configured with 0 reducers, but has reduce operators");
}
else
{
if (!json.has("shuffle") || json.get("shuffle").isNull())
error(null,
"Job ["
+ getText(json, "name")
+ "] is configured with reducers, but does not define shuffle operator");
if (!json.has("reduce") || json.get("reduce").isNull())
error(null,
"Job ["
+ getText(json, "name")
+ "] is configured with reducers, but does not define reduce operators");
}
}
lastShuffleNode = null;
linkedListHead = null;
blockIndexJoinKeys = null;
operatorMap.clear();
mapOutputNodes.clear();
}
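/**
 * Resolves the PostCondition of a map input from datasetConditions, stores its
 * schema in the JSON, and starts a new operator chain. If a chain from a
 * previous multi-mapper already exists, its head is saved in mapOutputNodes
 * for validation in visitShuffle().
 */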
@Override
public void visitInput(JsonNode json)
{
JsonNode pathJson = json.get("path");
String path;
if (pathJson.isArray())
path = JsonUtils.encodePath(pathJson.get(0));
else
path = JsonUtils.encodePath(pathJson);
// determine the postcondition of input
PostCondition inputCondition = datasetConditions.get(path);
if (inputCondition == null)
{
error(null, pathJson, "Cannot determine schema at path " + path);
}
// put the schema in json
((ObjectNode) json).put("schema", inputCondition.getSchema().toJson());
Node node = new Node(json, inputCondition);
// If the linkedListHead is not null, it implies there was a previous multimapper
// In which case, store the Node in a list for now.
// We will validate the postconditions in visitShuffle()
if (linkedListHead != null)
{
mapOutputNodes.add(linkedListHead);
}
linkedListHead = node;
operatorMap.clear();
operatorMap.put(getText(json, "name"), node);
}
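/**
 * In debug mode, prints the preconditions of the input blocks and the computed
 * post-condition for the operator defined at the given script line.
 */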
private void printConditions(String line,
Map<String, PostCondition> preConditions,
PostCondition postCondition)
{
if (!debugMode)
return;
print.f("---------------------------------------------");
print.f("%s\n", line);
for (String block : preConditions.keySet())
{
PostCondition preCondition = preConditions.get(block);
print.f("Precondition for %s", block);
print.f("\tSchema: %s", preCondition.getSchema());
print.f("\tPartition Keys: %s",
Arrays.toString(preCondition.getPartitionKeys()));
print.f("\tSort Keys: %s", Arrays.toString(preCondition.getSortKeys()));
if (preCondition.getPivotKeys() != null)
print.f("\tPivot Keys: %s",
Arrays.toString(preCondition.getPivotKeys()));
}
print.f("\nPost Condition");
if (postCondition == null)
{
print.f("\tERROR");
}
else
{
print.f("\tSchema: %s", postCondition.getSchema());
print.f("\tPartition Keys: %s",
Arrays.toString(postCondition.getPartitionKeys()));
print.f("\tSort Keys: %s", Arrays.toString(postCondition.getSortKeys()));
if (postCondition.getPivotKeys() != null)
print.f("\tPivot Keys: %s",
Arrays.toString(postCondition.getPivotKeys()));
}
}
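/**
 * Analyzes a single map- or reduce-side operator: collects the preconditions
 * of its input blocks, computes its PostCondition, stores the resulting schema
 * in the JSON, and links the operator into the node graph. BLOCK_INDEX_JOIN,
 * TEE and LOAD_BLOCK receive additional handling (join keys, dataset
 * conditions, partition/sort keys). Any failure is reported via error().
 */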
@Override
public void visitOperator(JsonNode json, boolean isMapper)
{
Map<String, PostCondition> preConditions = null;
PostCondition postCondition = null;
try
{
String[] inputNames = new String[] {};
if (json.has("input"))
inputNames = JsonUtils.asArray(json.get("input"));
preConditions = getPreconditions(inputNames, json);
if (preConditions == null)
return;
postCondition = getPostCondition(json, preConditions);
if (json.get("line") != null)
printConditions(json.has("line") ? getText(json, "line") : null,
preConditions,
postCondition);
if (postCondition == null)
return;
((ObjectNode) json).put("schema", postCondition.getSchema().toJson());
// create a node for this operator
Node node = new Node(json, postCondition);
for (String inputName : inputNames)
{
Node parent = operatorMap.get(inputName);
parent.setChild(node);
node.addParent(parent);
}
operatorMap.put(getText(json, "output"), node);
node.next = linkedListHead;
linkedListHead = node;
// special cases for individual operators
OperatorType type = OperatorType.valueOf(getText(json, "operator"));
switch (type)
{
case BLOCK_INDEX_JOIN:
this.blockIndexJoinKeys = asArray(json, "partitionKeys");
((ObjectNode) json).put("schema", postCondition.getSchema().toJson());
break;
case TEE:
this.datasetConditions.put(getText(json, "path"), postCondition);
((ObjectNode) json).put("schema", postCondition.getSchema().toJson());
break;
case LOAD_BLOCK:
((ObjectNode) json).put("schema", postCondition.getSchema().toJson());
if (postCondition.getPartitionKeys() != null)
((ObjectNode) json).put("partitionKeys",
JsonUtils.createArrayNode(postCondition.getPartitionKeys()));
if (postCondition.getSortKeys() != null)
((ObjectNode) json).put("sortKeys",
JsonUtils.createArrayNode(postCondition.getSortKeys()));
break;
default:
break;
}
}
catch (IllegalArgumentException e)
{
// error(json, "operator %s is not supported.", getText(json, "operator"));
printConditions(json.get("line").getTextValue(), preConditions, postCondition);
error(e, json, e.getMessage());
}
catch (PreconditionException e)
{
printConditions(json.get("line").getTextValue(), preConditions, postCondition);
error(e, json, e.toString());
}
catch (ArrayIndexOutOfBoundsException e)
{
error(e, json, e.toString());
}
}
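/**
 * Builds the "union" schema for a reduce-side join of the two mapper outputs:
 * the join keys (typed from the left schema), followed by the remaining left
 * columns, the remaining right columns, and a trailing ___tag INT column.
 * Validates that the join keys have identical types on both sides and that
 * non-key column names do not collide.
 */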
private BlockSchema createReduceSideJoinSchema(Node leftNode, Node rightNode, JsonNode shuffleJson)
{
// Build a "union" schema for the join
BlockSchema leftSchema = leftNode.condition.getSchema();
BlockSchema rightSchema = rightNode.condition.getSchema();
String[] joinKeys = asArray(leftNode.json, "joinKeys");
// validate that the join keys have the same type in both mappers
for (String joinKey: joinKeys)
{
ColumnType leftType = leftSchema.getColumnType(leftSchema.getIndex(joinKey));
ColumnType rightType = rightSchema.getColumnType(rightSchema.getIndex(joinKey));
if (!leftType.equals(rightType))
{
error(shuffleJson, "Datatype of join key " + joinKey + " is not same. Left: "
+ leftType + " Right: " + rightType);
}
}
// validate that column names are different in left and right tables
Set<String> joinKeySet = new HashSet<String>();
Set<String> leftColumnSet = new HashSet<String>();
for (String str: joinKeys)
joinKeySet.add(str);
for (String str: leftSchema.getColumnNames())
leftColumnSet.add(str);
for (String str: rightSchema.getColumnNames())
if (leftColumnSet.contains(str) && !joinKeySet.contains(str))
error(shuffleJson, "The names of columns (other than join keys) must be different. Found: " + str);
// the new schema will have #leftColumns + #rightColumns - #joinKeys + 1 (tag)
ColumnType[] joinTypes = new ColumnType[leftSchema.getNumColumns() + rightSchema.getNumColumns() - joinKeys.length + 1];
Set<String> joinKeysSet = new HashSet<String>();
// fill the schema with the join keys
int idx = 0;
for (String joinKey: joinKeys)
{
joinTypes[idx++] = leftSchema.getColumnType(leftSchema.getIndex(joinKey));
joinKeysSet.add(joinKey);
}
// fill the remaining columns from the left schema
for (int i = 0; i < leftSchema.getNumColumns(); i++)
{
String colName = leftSchema.getName(i);
if (joinKeysSet.contains(colName))
continue;
joinTypes[idx++] = leftSchema.getColumnType(i);
}
// fill the remaining columns from the right schema
for (int i = 0; i < rightSchema.getNumColumns(); i++)
{
String colName = rightSchema.getName(i);
if (joinKeysSet.contains(colName))
continue;
joinTypes[idx++] = rightSchema.getColumnType(i);
}
// finally, fill in the tag column (this is the LAST column)
joinTypes[idx++] = new ColumnType("___tag", DataType.INT);
BlockSchema joinSchema = new BlockSchema(joinTypes);
return joinSchema;
}
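/**
 * Analyzes the shuffle operator. For multi-mapper jobs it validates that all
 * mappers shuffle the same block name with compatible schemas, building a
 * combined schema for reduce-side joins and widening numeric column types
 * otherwise. It then expands the pivot keys for DISTINCT shuffles, checks that
 * partition and pivot keys exist in the schema, verifies combiner aggregates,
 * and starts a new operator chain with the shuffle output as the input block.
 */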
@Override
public void visitShuffle(JsonNode json)
{
// Add the final node of the current mapper's chain to the list
mapOutputNodes.add(linkedListHead);
// if there is more than one mapper (multi-mapper situation)
if (mapOutputNodes.size() > 1)
{
// special case: REDUCE-SIDE join
if (json.has("join") && json.get("join").getBooleanValue())
{
// Build a "union" schema for the join
Node leftNode = mapOutputNodes.get(0);
Node rightNode = mapOutputNodes.get(1);
BlockSchema joinSchema = createReduceSideJoinSchema(leftNode, rightNode, json);
// Assign this schema to both mappers
leftNode.condition = new PostCondition(joinSchema,
leftNode.condition.getPartitionKeys(),
leftNode.condition.getSortKeys());
rightNode.condition = new PostCondition(joinSchema,
rightNode.condition.getPartitionKeys(),
rightNode.condition.getSortKeys());
// set this joined schema in json for both operators
((ObjectNode) leftNode.json).put("schema", joinSchema.toJson());
((ObjectNode) rightNode.json).put("schema", joinSchema.toJson());
}
// validate that we are shuffling on the same block name from each multi-mapper.
// Also validate that the schemas are identical (with one exception: reduce-side join)
String outputName = getText(mapOutputNodes.get(0).json, "output");
BlockSchema firstSchema = mapOutputNodes.get(0).getPostCondition().getSchema();
BlockSchema widerSchema = firstSchema;
for (int i = 1; i < mapOutputNodes.size(); i++)
{
String name = getText(mapOutputNodes.get(i).json, "output");
BlockSchema schema = mapOutputNodes.get(i).getPostCondition().getSchema();
if (!name.equals(outputName))
error(json, "Multimappers must shuffle on same block names. Found: " + name + " and " + outputName);
if (!firstSchema.equalsIgnoreNumeric(schema))
error(json, "Multimappers must have same output schema. Found:\n\t"
+ firstSchema + " and\n\t" + schema);
widerSchema = SchemaUtils.getWiderSchema(widerSchema, schema);
}
// update the condition for each multi-mapper with the wider schema
for (int i = 0; i < mapOutputNodes.size(); i++)
{
PostCondition condition = mapOutputNodes.get(i).condition;
mapOutputNodes.get(i).condition = new PostCondition(widerSchema,
condition.getPartitionKeys(),
condition.getSortKeys());
}
}
String[] inputNames = asArray(json.get("name"));
Map<String, PostCondition> preConditions = getPreconditions(inputNames, json);
if (preConditions == null)
return;
PostCondition preCondition = preConditions.values().iterator().next();
if (json.has("distinctShuffle") && json.get("distinctShuffle").getBooleanValue())
{
String[] columns = preCondition.getSchema().getColumnNames();
JsonNode columnsJson = JsonUtils.createArrayNode(columns);
((ObjectNode) json).put("partitionKeys", columnsJson);
((ObjectNode) json).put("pivotKeys", columnsJson);
}
String[] partitionKeys = asArray(json, "partitionKeys");
String[] pivotKeys = asArray(json, "pivotKeys");
// make sure partition and pivot keys are in the schema
if (partitionKeys != null)
{
checkShuffleKeyErrors(json, preCondition.getSchema(), partitionKeys);
}
if (pivotKeys != null)
{
checkShuffleKeyErrors(json, preCondition.getSchema(), pivotKeys);
}
// if there is a distinct operator in the reduce phase, add the remaining columns to the pivot keys
if (json.has("distinct") && json.get("distinct").getBooleanValue())
{
BlockSchema remaining =
preCondition.getSchema().getComplementSubset(pivotKeys);
String[] completePivotKeys =
CommonUtils.concat(pivotKeys, remaining.getColumnNames());
((ObjectNode) json).put("pivotKeys",
JsonUtils.createArrayNode(completePivotKeys));
pivotKeys = asArray(json, "pivotKeys");
}
if (json.has("aggregates"))
{
try
{
CubertCombiner.checkPostCondition(preConditions, json);
}
catch (PreconditionException e)
{
error(json, e.getMessage());
}
}
PostCondition postCondition =
new PostCondition(preCondition.getSchema(), partitionKeys, pivotKeys);
// put the schema in the json
((ObjectNode) json).put("schema", postCondition.getSchema().toJson());
// create a node for this operator
Node shuffleNode = new Node(json, postCondition);
for (String inputName : inputNames)
{
Node parent = operatorMap.get(inputName);
parent.setChild(shuffleNode);
shuffleNode.addParent(parent);
}
// validate that the output of every map-side operator is consumed
for (Node node : mapOutputNodes)
{
node.setChild(shuffleNode);
while (node != null)
{
if (node.child == null)
error(node.json, "The output of this command is not used");
node = node.next;
}
}
// start a new chain with shuffle as the input block
operatorMap.clear();
linkedListHead = shuffleNode;
operatorMap.put(getText(json, "name"), shuffleNode);
}
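/**
 * Verifies that each shuffle key is present in the schema and is not specified
 * more than once.
 */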
private void checkShuffleKeyErrors(JsonNode json, BlockSchema schema, String[] keys)
{
final Map<String, Integer> schemaIndexMap = schema.getIndexMap();
HashSet<String> cols = new HashSet<String>();
for (String key : keys)
{
if (!schemaIndexMap.containsKey(key))
{
error(json, "Column " + key + " not found in schema:\n\t" + schema);
}
if (cols.contains(key))
{
error(json, "Duplicate key specified: " + key);
}
cols.add(key);
}
}
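/**
 * For jobs that write RUBIX output, attaches a "metadata" node (schema,
 * partition keys, sort keys, rubix file version, block format and blockgen id)
 * to the job JSON. If a metadata node is already present and this is not a
 * revisit pass, a RuntimeException is thrown.
 */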
@Override
public void exitJob(JsonNode json)
{
JsonNode outputJson = json.get("output");
boolean isRubixStorage =
outputJson.has("type")
&& getText(outputJson, "type").equalsIgnoreCase("RUBIX");
// if the output type is rubix, add a metadata node to the json
if (isRubixStorage)
{
// the metadata has the following fields
JsonNode metadataNode =
JsonUtils.createObjectNode("schema",
outputJson.get("schema"),
"partitionKeys",
outputJson.get("partitionKeys"),
"sortKeys",
outputJson.get("sortKeys"),
"version",
RubixConstants.RUBIX_FILE_VERSION_NUMBER,
"BlockFormat",
"DefaultRubixFormat",
"BlockgenId",
getText(outputJson, "blockgenId"));
if (!json.has("metadata") || revisit)
((ObjectNode) json).put("metadata", metadataNode);
else
throw new RuntimeException(String.format("The metadata node is already added for the output of job %s, which is not expected.",
json.get("name").toString()));
}
}
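/**
 * Analyzes the job output: copies the schema (and, for RUBIX storage, the
 * partition and sort keys) of the output's input block into the JSON, checks
 * that every operator's output is consumed, and records the output path's
 * PostCondition for downstream jobs.
 */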
@Override
public void visitOutput(JsonNode json)
{
String[] inputNames = asArray(json.get("name"));
Map<String, PostCondition> preConditions = getPreconditions(inputNames, json);
if (preConditions == null)
return;
PostCondition preCondition = preConditions.values().iterator().next();
PostCondition postCondition = preCondition;
boolean isRubixStorage =
json.has("type") && getText(json, "type").equalsIgnoreCase("RUBIX");
// Only rubix storage allows partition and sort keys
if (!isRubixStorage)
{
postCondition = new PostCondition(postCondition.getSchema(), null, null);
}
// put the schema in the json
((ObjectNode) json).put("schema", postCondition.getSchema().toJson());
if (postCondition.getPartitionKeys() != null)
((ObjectNode) json).put("partitionKeys",
JsonUtils.createArrayNode(postCondition.getPartitionKeys()));
if (postCondition.getSortKeys() != null)
((ObjectNode) json).put("sortKeys",
JsonUtils.createArrayNode(postCondition.getSortKeys()));
Node node = new Node(json, postCondition);
for (String inputName : inputNames)
{
Node parent = operatorMap.get(inputName);
parent.setChild(node);
node.addParent(parent);
}
// validate that the output of every operator is consumed
node = linkedListHead;
while (node != null)
{
if (node.child == null)
error(node.json, "The output of this command is not used");
node = node.next;
}
datasetConditions.put(getText(json, "path"), postCondition);
}
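/**
 * Reports a semantic error against the given operator JSON, marks the plan as
 * having errors, prints the offending script line (if available), and aborts
 * the rewrite by throwing a PlanRewriteException.
 */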
private void error(JsonNode json, String format, Object... args)
{
error(null, json, format, args);
}
private void error(Exception e, JsonNode json, String format, Object... args)
{
if (debugMode && e != null)
e.printStackTrace();
hasErrors = true;
System.err.println(String.format("ERROR: " + format, args));
if (json != null)
{
System.err.print("At:\t");
if (json.has("line"))
System.err.println(json.get("line").getTextValue());
else
System.err.println(json.toString());
}
if (e != null)
throw new PlanRewriteException(e);
else
throw new PlanRewriteException();
}
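/**
 * Collects the PostConditions of the named input blocks as the preconditions
 * of the current operator. Reports an error if an input block is unknown or if
 * its producing operator already has a consumer (with exceptions for
 * LOAD_BLOCK, in-memory PIVOT_BLOCK and USER_DEFINED_BLOCK_OPERATOR). Returns
 * null if any precondition could not be resolved.
 */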
private Map<String, PostCondition> getPreconditions(String[] inputNames, JsonNode json)
{
Map<String, PostCondition> preConditions = new HashMap<String, PostCondition>();
// verify that all inputs are present, and put them in the preConditions map
for (String inputName : inputNames)
{
Node parent = operatorMap.get(inputName);
if (parent == null)
{
error(json, "input block %s not found", inputName);
continue;
}
if (parent.child != null)
{
// Exception 1: it is okay for the parent to have multiple children,
// IF all other children are LOAD_BLOCK (which use the "MATCHING"
// keyword to reference the parent)
boolean isException = false;
if (json.has("operator")
&& getText(json, "operator").equals("LOAD_BLOCK"))
isException = true;
if (parent.child.json.has("operator")
&& getText(parent.child.json, "operator").equals("LOAD_BLOCK"))
isException = true;
// Exception 2: it is okay for the parent to have multiple children,
// IF the parent is an in-memory PIVOT_BLOCK operator
if (parent.json.has("operator")
&& getText(parent.json, "operator").equals("PIVOT_BLOCK")
&& parent.json.has("inMemory")
&& parent.json.get("inMemory").getBooleanValue())
isException = true;
// Exception 3 (for now): it is okay for the parent to have multiple children,
// if the other children are USER_DEFINED_BLOCK_OPERATOR
if ((json.has("operator") && getText(json, "operator").equals("USER_DEFINED_BLOCK_OPERATOR"))
|| (parent.child.json.has("operator") && getText(parent.child.json,
"operator").equals("USER_DEFINED_BLOCK_OPERATOR")))
isException = true;
if (!isException)
{
error(json,
"parent operator [%s] has a child assigned already.\n\tChild: %s",
parent.json.get("line"),
parent.child.json.get("line"));
continue;
}
}
preConditions.put(inputName, parent.condition);
}
// bail out, if we haven't accumulated all preconditions
if (preConditions.size() != inputNames.length)
return null;
return preConditions;
}
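/**
 * Computes the PostCondition of an operator from its preconditions. Tuple
 * operators delegate to their own getPostCondition implementation; block
 * operators (CREATE_BLOCK, LOAD_BLOCK, LOAD_CACHED_FILE, PIVOT_BLOCK,
 * COLLATE_VECTOR_BLOCK, USER_DEFINED_BLOCK_OPERATOR) are handled explicitly
 * here.
 */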
private PostCondition getPostCondition(JsonNode json,
Map<String, PostCondition> preConditions) throws PreconditionException
{
OperatorType type = OperatorType.valueOf(getText(json, "operator"));
if (type.isTupleOperator())
{
// TupleOperator operatorObject = OperatorFactory.getTupleOperator(type);
TupleOperator operatorObject =
(type == OperatorType.USER_DEFINED_TUPLE_OPERATOR)
? (TupleOperator) FunctionFactory.createFunctionObject(getText(json,
"class"),
json.get("constructorArgs"))
: OperatorFactory.getTupleOperator(type);
return operatorObject.getPostCondition(preConditions, json);
}
else
{
switch (type)
{
case CREATE_BLOCK:
{
String[] partitionKeys = JsonUtils.asArray(json, "partitionKeys");
PostCondition condition = preConditions.values().iterator().next();
BlockSchema schema = condition.getSchema();
String[] sortKeys = condition.getSortKeys();
if (json.has("pivotKeys"))
sortKeys = JsonUtils.asArray(json, "pivotKeys");
if (!CommonUtils.isPrefix(partitionKeys, condition.getPartitionKeys())
&& !CommonUtils.isPrefix(condition.getPartitionKeys(),
partitionKeys))
throw new PreconditionException(PreconditionExceptionType.INVALID_PARTITION_KEYS);
boolean isIndexed = getText(json, "blockgenType").equals("BY_INDEX");
if (isIndexed)
{
partitionKeys = blockIndexJoinKeys;
// remove BLOCK_ID from the schema
ColumnType[] ctypes = new ColumnType[schema.getNumColumns() - 1];
int idx = 0;
for (int i = 0; i < schema.getNumColumns(); i++)
if (!schema.getColumnType(i).getName().equals("BLOCK_ID"))
ctypes[idx++] = schema.getColumnType(i);
schema = new BlockSchema(ctypes);
// remove BLOCK_ID from sort keys
if (sortKeys.length == 0)
System.out.println("Empty sortKeys for opNode " + json.toString());
String[] tmp = new String[sortKeys.length - 1];
idx = 0;
for (String key : sortKeys)
if (!key.equals("BLOCK_ID"))
tmp[idx++] = key;
sortKeys = tmp;
}
return new PostCondition(schema, partitionKeys, sortKeys);
}
case LOAD_BLOCK:
{
if (json.get("path") == null)
System.out.println("null path name for " + json);
String path = JsonUtils.encodePath(json.get("path"));
PostCondition postCondition = this.datasetConditions.get(path);
if (postCondition == null)
error(json, "Cannot determine schema of " + path);
return postCondition;
}
case LOAD_CACHED_FILE:
{
String path = JsonUtils.encodePath(json.get("path"));
try
{
path = new URI(path).getPath();
}
catch (URISyntaxException e)
{
// if the path is not a valid URI, keep the raw path and continue
e.printStackTrace();
}
PostCondition postCondition = this.datasetConditions.get(path);
if (postCondition == null)
error(json, "Cannot determine schema of " + path);
return new PostCondition(postCondition.getSchema(),
new String[] {},
new String[] {});
}
case PIVOT_BLOCK:
{
BlockOperator operator = new PivotBlockOperator();
return operator.getPostCondition(preConditions, json);
}
case COLLATE_VECTOR_BLOCK:
{
PostCondition preCondition = preConditions.values().iterator().next();
String metaRelationName =
new String(JsonUtils.getText(json, "metaRelationName"));
BlockSchema outSchema = new BlockSchema(preCondition.getSchema());
PostCondition preConditionMetaRelation =
preConditions.get(metaRelationName);
String identifierColumns[] = JsonUtils.asArray(json, "identifierColumn");
// This will allow a subsequent GROUP BY on (identifierColumns,
// lookupColumns)
if (!CommonUtils.isPrefix(preConditionMetaRelation.getSortKeys(),
identifierColumns))
{
System.out.println("PreconditionMetaRelation sortKeys "
+ Arrays.toString(preConditionMetaRelation.getSortKeys()));
throw new PreconditionException(PreconditionExceptionType.INVALID_SORT_KEYS);
}
String[] combineColumns = JsonUtils.asArray(json, "combineColumns");
String[] lookupColumns = JsonUtils.asArray(json, "lookupColumn");
if (!CommonUtils.isPrefix(preCondition.getSortKeys(),
CommonUtils.concat(lookupColumns,
combineColumns)))
{
System.out.println("PreconditionInputBlock sortKeys "
+ Arrays.toString(preCondition.getSortKeys()));
throw new PreconditionException(PreconditionExceptionType.INVALID_SORT_KEYS);
}
int identifierColumnIndex =
preConditionMetaRelation.getSchema()
.getIndex(getText(json,
"identifierColumn"));
ColumnType ct =
preConditionMetaRelation.getSchema()
.getColumnType(identifierColumnIndex);
// rename identifierColumn for output schema.
identifierColumns[0] =
String.format("%s___%s",
getText(json, "metaRelationName"),
getText(json, "identifierColumn"));
ct.setName(identifierColumns[0]);
BlockSchema identifierColumnSchema =
new BlockSchema(new ColumnType[] { ct });
/*
* String newColumn = String.format("STRING %s___%s",
* CommonUtils.stripQuotes(getText(json, "metaRelationName")),
* CommonUtils.stripQuotes(getText(json, "identifierColumn")));
* BlockSchema newColumnSchema = new BlockSchema(newColumn);
*/
// Only the metaRelation identifier column is renamed. Columns from the
// "data/metrics" block are transmitted with their old names.
outSchema = outSchema.append(identifierColumnSchema);
return new PostCondition(outSchema,
preCondition.getPartitionKeys(),
(combineColumns),
identifierColumns);
}
case USER_DEFINED_BLOCK_OPERATOR:
{
BlockOperator operator =
(BlockOperator) FunctionFactory.createFunctionObject(getText(json,
"class"),
json.get("constructorArgs"));
// OperatorFactory.getUserDefinedBlockOperator(getText(json, "class"),
// json.get("constructorArgs"));
return operator.getPostCondition(preConditions, json);
}
default:
throw new IllegalArgumentException("Operator " + type
+ " is not supported.");
}
}
}
}