RuleMatcher.java example

Explorer
hadoop-pig-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.impl.plan.optimizer;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;

import org.apache.pig.PigException;
import org.apache.pig.impl.plan.Operator;
import org.apache.pig.impl.plan.OperatorPlan;
import org.apache.pig.impl.plan.VisitorException;
import org.apache.pig.impl.plan.optimizer.RuleOperator.NodeType;
import org.apache.pig.impl.util.Pair;

/**
 * RuleMatcher contains the logic to determine whether a given rule matches.
 * This alone does not mean the rule will be applied.  Transformer.check()
 * still has to pass before Transformer.transform() is called.
 */

public class RuleMatcher<O extends Operator, P extends OperatorPlan<O>> {

    // Rule currently being matched; assigned at the start of match().
    private Rule<O, P> mRule;
    // Working list for the match attempt in progress: (plan node, rule node type)
    // pairs. Re-created by every call to beginMatch().
    private List<Pair<O, RuleOperator.NodeType>> mMatch;
    // Every mMatch for which beginMatch() returned true during the current walk.
    private List<List<Pair<O, RuleOperator.NodeType>>> mPrelimMatches = new ArrayList<List<Pair<O, RuleOperator.NodeType>>>();
    // Final matches; each inner list holds the first node of one or more
    // preliminary matches (see processPreliminaryMatches()).
    private List<List<O>> mMatches = new ArrayList<List<O>>();
    private P mPlan; // for convenience.
    // Count reported by CommonNodeFinder for the rule plan; set in match().
    private int mNumCommonNodes = 0;

    /**
     * Checks whether the given rule matches the plan held by the rule's
     * transformer. All matches found are recorded and can be read back through
     * {@link #getMatches()} and {@link #getAllMatches()}.
     *
     * @param rule Rule to test for a match.
     * @return true if at least one match was found.
     * @throws OptimizerException if the common nodes of the rule plan cannot
     *         be computed.
     */
    public boolean match(Rule<O, P> rule) throws OptimizerException {
        mRule = rule;

        // Count the common nodes of the rule plan first; the walkers need the
        // number to post-process their preliminary matches.
        CommonNodeFinder finder = new CommonNodeFinder(mRule.getPlan());
        try {
            finder.visit();
            mNumCommonNodes = finder.getCount();
        } catch (VisitorException ve) {
            throw new OptimizerException(
                    "Internal error. Problem in computing common nodes in the Rule Plan.",
                    2125,
                    PigException.BUG,
                    ve);
        }

        // Reset the state left over from a previous invocation.
        mPlan = mRule.getTransformer().getPlan();
        mMatches.clear();
        mPrelimMatches.clear();

        // Dispatch to the traversal requested by the rule.
        if (mRule.getWalkerAlgo() == Rule.WalkerAlgo.DependencyOrderWalker) {
            DependencyOrderWalker();
        } else if (mRule.getWalkerAlgo() == Rule.WalkerAlgo.DepthFirstWalker) {
            DepthFirstWalker();
        } else if (mRule.getWalkerAlgo() == Rule.WalkerAlgo.ReverseDependencyOrderWalker) {
            ReverseDependencyOrderWalker();
        }

        return !mMatches.isEmpty();
    }
    
    /**
     * Visits the plan so that every node is handled after all of its
     * successors, attempts a match at each node and, if any preliminary match
     * was found, computes the final matches.
     */
    private void ReverseDependencyOrderWalker()
    {
        List<O> planRoots = mPlan.getRoots();
        if (planRoots == null) {
            return;
        }

        // Build the visiting order: successors first, then the node itself.
        List<O> visitOrder = new ArrayList<O>();
        Set<O> visited = new HashSet<O>();
        for (O root : planRoots) {
            RDODoAllSuccessors(root, visited, visitOrder);
        }

        // Try to start a match at every node, in visiting order.
        for (O candidate : visitOrder) {
            if (beginMatch(candidate)) {
                mPrelimMatches.add(mMatch);
            }
        }

        if (!mPrelimMatches.isEmpty()) {
            processPreliminaryMatches();
        }
    }
        
    /**
     * Recursively appends the successors of a node, then the node itself, to
     * the output collection. Nodes already present in {@code seen} are skipped.
     *
     * @param node node to process
     * @param seen nodes that have already been appended
     * @param fifo output collection receiving nodes in visiting order
     */
    private void RDODoAllSuccessors(O node, Set<O> seen, Collection<O> fifo)
    {
        if (seen.contains(node)) {
            // Already emitted through another path.
            return;
        }

        // Emit everything downstream before the node itself.
        Collection<O> children = mPlan.getSuccessors(node);
        if (children != null) {
            for (O child : children) {
                RDODoAllSuccessors(child, seen, fifo);
            }
        }

        seen.add(node);
        fifo.add(node);
    }
    
    /**
     * Visits the plan so that every node is handled after all of its
     * predecessors, attempts a match at each node and, if any preliminary
     * match was found, computes the final matches.
     */
    private void DependencyOrderWalker()
    {
        List<O> planLeaves = mPlan.getLeaves();
        if (planLeaves == null) {
            return;
        }

        // Build the visiting order: predecessors first, then the node itself.
        List<O> visitOrder = new ArrayList<O>();
        Set<O> visited = new HashSet<O>();
        for (O leaf : planLeaves) {
            BFSDoAllPredecessors(leaf, visited, visitOrder);
        }

        // Try to start a match at every node, in visiting order.
        for (O candidate : visitOrder) {
            if (beginMatch(candidate)) {
                mPrelimMatches.add(mMatch);
            }
        }

        if (!mPrelimMatches.isEmpty()) {
            processPreliminaryMatches();
        }
    }
    
    /**
     * Computes the final matches (mMatches) from the preliminary matches
     * (mPrelimMatches).
     *
     * The preliminary matches contain paths that match the specification in
     * the RulePlan. If the rule plan has no common nodes, every preliminary
     * match yields one final match holding its first node. Otherwise,
     * preliminary matches that share the same set of common nodes are merged
     * and the final match holds the first node of each merged preliminary
     * match.
     *
     * If the number of common nodes found in any preliminary match disagrees
     * with mNumCommonNodes the method returns without adding further matches.
     */
    private void processPreliminaryMatches() {
        // De-duplicated common nodes of each preliminary match; an entry is
        // null when the match contains no common node.
        List<List<O>> commonNodesPerMatch = new ArrayList<List<O>>();
        for (List<Pair<O, RuleOperator.NodeType>> prelimMatch : mPrelimMatches) {
            commonNodesPerMatch.add(getCommonNodesFromMatch(prelimMatch));
        }

        if (mNumCommonNodes == 0) {
            // The rule plan had simple paths.

            // Verification step: a preliminary match containing common nodes
            // is an anomaly. Just return, leaving mMatches untouched.
            for (List<O> commonNodes : commonNodesPerMatch) {
                if (commonNodes != null) {
                    return;
                }
            }

            // Each preliminary match becomes a final match made of its first node.
            for (List<Pair<O, RuleOperator.NodeType>> prelimMatch : mPrelimMatches) {
                List<O> match = new ArrayList<O>();
                match.add(prelimMatch.get(0).first);
                mMatches.add(match);
            }
            return;
        }

        // The rule plan states that there are mNumCommonNodes common nodes;
        // every preliminary match must contain exactly that many.
        for (List<O> commonNodes : commonNodesPerMatch) {
            int commonNodeCount = (commonNodes == null) ? 0 : commonNodes.size();
            if (commonNodeCount != mNumCommonNodes) {
                return;
            }
        }

        // Keeps track of the preliminary matches that were already consumed.
        boolean[] processed = new boolean[mPrelimMatches.size()];

        // Every index is considered as an outer match, including the last
        // one: a preliminary match that was not merged into an earlier match
        // must still produce a final match of its own.
        for (int outerIndex = 0; outerIndex < mPrelimMatches.size(); ++outerIndex) {
            if (processed[outerIndex]) {
                continue;
            }

            List<Pair<O, RuleOperator.NodeType>> outerMatch = mPrelimMatches.get(outerIndex);
            Set<O> outerCommonNodes = new HashSet<O>(commonNodesPerMatch.get(outerIndex));
            Set<O> cumulativeIntersection = new HashSet<O>(outerCommonNodes);
            Set<O> unionOfRoots = new HashSet<O>();
            boolean innerMatchProcessed = false;
            unionOfRoots.add(outerMatch.get(0).first);

            for (int innerIndex = outerIndex + 1; innerIndex < mPrelimMatches.size(); ++innerIndex) {
                if (processed[innerIndex]) {
                    // Skip (rather than stop at) matches merged earlier so
                    // that later candidates are still examined.
                    continue;
                }

                // Intersect on a fresh copy: retainAll() mutates its receiver,
                // and the outer set must stay intact for the next comparison.
                Set<O> pairIntersection = new HashSet<O>(outerCommonNodes);
                pairIntersection.retainAll(new HashSet<O>(commonNodesPerMatch.get(innerIndex)));

                if (pairIntersection.size() != mNumCommonNodes) {
                    // Different set of common nodes, i.e. another occurrence
                    // of the pattern. Leave it unprocessed and move on.
                    continue;
                }

                Set<O> tempCumulativeIntersection = new HashSet<O>(cumulativeIntersection);
                tempCumulativeIntersection.retainAll(pairIntersection);
                if (tempCumulativeIntersection.size() != mNumCommonNodes) {
                    // Size mismatch between the cumulative intersection and
                    // the intersection of the inner and outer common nodes:
                    // set mMatches to empty and return.
                    mMatches = new ArrayList<List<O>>();
                    return;
                }

                processed[innerIndex] = true;
                innerMatchProcessed = true;
                cumulativeIntersection = tempCumulativeIntersection;
                unionOfRoots.add(mPrelimMatches.get(innerIndex).get(0).first);
            }

            cumulativeIntersection.retainAll(outerCommonNodes);
            if (cumulativeIntersection.size() != mNumCommonNodes && innerMatchProcessed) {
                // Inner matches were merged yet no common intersection
                // remains: set mMatches to empty and return.
                mMatches = new ArrayList<List<O>>();
                return;
            }

            processed[outerIndex] = true;
            mMatches.add(new ArrayList<O>(unionOfRoots));
        }
    }

    /**
     * Extracts the distinct plan nodes of a match whose rule node type is
     * COMMON_NODE, in order of first appearance.
     *
     * @param match (plan node, rule node type) pairs of one match
     * @return the distinct common nodes, or null if the match has none
     */
    private List<O> getCommonNodesFromMatch(List<Pair<O, NodeType>> match) {
        List<O> distinctCommonNodes = null;
        // Remembers which nodes were already emitted, to weed out duplicates.
        Set<O> alreadyEmitted = new HashSet<O>();

        for (Pair<O, NodeType> entry : match) {
            if (!entry.second.equals(RuleOperator.NodeType.COMMON_NODE)) {
                continue;
            }
            // The result list is only created once a common node shows up.
            if (distinctCommonNodes == null) {
                distinctCommonNodes = new ArrayList<O>();
            }
            // add() reports false when the node was seen before.
            if (alreadyEmitted.add(entry.first)) {
                distinctCommonNodes.add(entry.first);
            }
        }
        return distinctCommonNodes;
    }

    /**
     * Recursively appends the predecessors of a node, then the node itself,
     * to the output collection. Nodes already present in {@code seen} are
     * skipped.
     *
     * @param node node to process
     * @param seen nodes that have already been appended
     * @param fifo output collection receiving nodes in visiting order
     */
    private void BFSDoAllPredecessors(O node, Set<O> seen, Collection<O> fifo)  {
        if (seen.contains(node)) {
            // Already emitted through another path.
            return;
        }

        // Emit everything upstream before the node itself.
        Collection<O> parents = mPlan.getPredecessors(node);
        if (parents != null) {
            for (O parent : parents) {
                BFSDoAllPredecessors(parent, seen, fifo);
            }
        }

        seen.add(node);
        fifo.add(node);
    }
    
    /**
     * Visits the plan depth first starting at its roots, attempts a match at
     * each node and, if any preliminary match was found, computes the final
     * matches.
     */
    private void DepthFirstWalker()
    {
        Set<O> seen = new HashSet<O>();
        DFSVisit(null, mPlan.getRoots(), seen);

        // DFSVisit only fills mPrelimMatches. Without this step mMatches
        // stays empty and match() could never report success for rules that
        // use this walker; the other walkers finish the same way.
        if (!mPrelimMatches.isEmpty()) {
            processPreliminaryMatches();
        }
    }
    
    /**
     * Depth-first traversal step: for each not-yet-seen successor, attempts a
     * match starting there and then descends into its own successors.
     *
     * @param node the node whose successors are being visited (not used by
     *        the traversal itself)
     * @param successors nodes to visit; may be null
     * @param seen nodes visited so far
     */
    private void DFSVisit(O node, Collection<O> successors,Set<O> seen)
    {
        if (successors == null) {
            return;
        }
        for (O successor : successors) {
            // add() returns false when the node has been visited already.
            if (!seen.add(successor)) {
                continue;
            }
            if (beginMatch(successor)) {
                mPrelimMatches.add(mMatch);
            }
            DFSVisit(successor, mPlan.getSuccessors(successor), seen);
        }
    }

    /**
     * @return the first match recorded by the last call to match(), i.e. the
     * list of plan nodes that matched the pattern defined by the rule, or
     * null when no match was recorded.
     */
    List<O> getMatches() {
        return mMatches.isEmpty() ? null : mMatches.get(0);
    }

    /**
     * @return all matches recorded by the last call to match(); each inner
     * list holds the plan nodes of one occurrence of the pattern defined by
     * the rule. The internal list itself is returned, not a copy.
     */
    public List<List<O>> getAllMatches() {
        return this.mMatches;
    }

    /*
     * Starts a match attempt at the given plan node. mMatch is re-created on
     * every call and receives the (node, rule node type) pairs collected
     * along the way.
     *
     * This pattern matching is fairly simple and makes some important
     * assumptions.
     * 1)  The pattern to be matched must be expressible as a graph.
     * 2)  The pattern must always begin with one of the root nodes in the rule plan.
     *     After that it can go where it wants.
     *
     * Returns true when the node matches a root of the rule plan and the
     * pattern below that root is matched by the node's successors.
     */
    private boolean beginMatch(O node) {
        if (node == null) {
            return false;
        }

        mMatch = new ArrayList<Pair<O, RuleOperator.NodeType>>();

        for (RuleOperator ruleRoot : mRule.getPlan().getRoots()) {
            boolean rootAccepted =
                    node.getClass().getName().equals(ruleRoot.getNodeClass().getName())
                    || ruleRoot.getNodeType().equals(RuleOperator.NodeType.ANY_NODE);
            if (!rootAccepted) {
                // Try the next root of the rule plan.
                continue;
            }

            mMatch.add(new Pair<O, RuleOperator.NodeType>(node, ruleRoot.getNodeType()));

            // Follow the edge to see the next node we should be looking for.
            List<RuleOperator> expectedSuccessors = mRule.getPlan().getSuccessors(ruleRoot);
            if (expectedSuccessors == null) {
                // The rule was looking for a single node.
                return true;
            }

            List<O> actualSuccessors = mPlan.getSuccessors(node);
            if (actualSuccessors == null) {
                // The rule root has successors but the node does not.
                return false;
            }
            if (actualSuccessors.size() != expectedSuccessors.size()) {
                // The number of successors differs between rule and plan.
                return false;
            }

            // Every successor is visited (no short-circuit) so that each one
            // gets the chance to add itself to mMatch.
            boolean anySuccessorMatched = false;
            for (O successor : actualSuccessors) {
                boolean matched = continueMatch(successor, expectedSuccessors);
                anySuccessorMatched = anySuccessorMatched || matched;
            }
            return anySuccessorMatched;
        }

        // If we get here we haven't found it.
        return false;
    }

    /**
     * Continues a match attempt below the root: compares a plan node with the
     * rule operators expected at this level and, for the first operator that
     * accepts the node, recurses into the node's successors.
     *
     * An accepted node is appended to mMatch before its successors are
     * examined; entries appended during an attempt that later fails are not
     * removed.
     *
     * @param node plan node to test
     * @param ruleOperators rule operators allowed at this position
     * @return true if the node is accepted by one of the rule operators and
     *         the pattern below that operator is matched
     */
    private boolean continueMatch(O node, List<RuleOperator> ruleOperators) {
        for (RuleOperator ruleOperator : ruleOperators) {
            boolean accepted =
                    node.getClass().getName().equals(ruleOperator.getNodeClass().getName())
                    || ruleOperator.getNodeType().equals(RuleOperator.NodeType.ANY_NODE);
            if (!accepted) {
                // Keep looking: the node may correspond to another operator
                // at this level. Returning here would mean that only the
                // first rule operator is ever compared.
                continue;
            }

            mMatch.add(new Pair<O, RuleOperator.NodeType>(node, ruleOperator.getNodeType()));

            // Follow the edge to see the next node we should be looking for.
            List<RuleOperator> ruleOperatorSuccessors = mRule.getPlan().getSuccessors(ruleOperator);
            if (ruleOperatorSuccessors == null) {
                // We've completed the match
                return true;
            }

            List<O> nodeSuccessors = mPlan.getSuccessors(node);
            if ((nodeSuccessors == null)
                    || (nodeSuccessors.size() != ruleOperatorSuccessors.size())) {
                // The ruleOperator has successors but the node does not
                // OR
                // the number of successors for the ruleOperator does not match
                // the number of successors for the node.
                return false;
            }

            // Every successor is visited (no short-circuit) so that each one
            // gets the chance to add itself to mMatch.
            boolean foundMatch = false;
            for (O nodeSuccessor : nodeSuccessors) {
                foundMatch |= continueMatch(nodeSuccessor, ruleOperatorSuccessors);
            }
            return foundMatch;
        }

        // None of the rule operators at this level accepted the node.
        return false;
    }

}