/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.pig.pen.util; import java.util.*; import org.apache.pig.data.Tuple; public class LineageTracer { // Use textbook Union-Find data structure, with counts associated with items // note: we test for equality by comparing tuple references, not by calling // the "equals()" method // the "IdentityHashMap" data structure is based on reference equality IdentityHashMap<Tuple, Tuple> parents = new IdentityHashMap<Tuple, Tuple>(); IdentityHashMap<Tuple, Integer> counts = new IdentityHashMap<Tuple, Integer>(); // has // one // entry // per // unique // tuple // being // tracked IdentityHashMap<Tuple, Integer> ranks = new IdentityHashMap<Tuple, Integer>(); // insert a new tuple (if a tuple is inserted multiple times, it gets a // count > 1) public void insert(Tuple t) { if (parents.containsKey(t)) { counts.put(t, counts.get(t) + 1); } else { parents.put(t, t); counts.put(t, 1); ranks.put(t, 0); } } // union two tuple sets public void union(Tuple t1, Tuple t2) { link(getRepresentative(t1), getRepresentative(t2)); } // find the set representative of a given tuple public Tuple getRepresentative(Tuple t) { Tuple tParent = parents.get(t); if (tParent != t) { tParent = getRepresentative(tParent); parents.put(t, tParent); } return tParent; } private void link(Tuple t1, Tuple t2) { int t1Rank = ranks.get(t1); int t2Rank = ranks.get(t2); if (t1Rank > t2Rank) { parents.put(t2, t1); } else { parents.put(t1, t2); if (t1Rank == t2Rank) ranks.put(t2, t2Rank + 1); } } // get the cardinality of each tuple set (identified by a representative // tuple) public IdentityHashMap<Tuple, Double> getCounts() { return getWeightedCounts(2f, 1f); } // get the cardinality of each tuple set, weighted in a special way // weighting works like this: if a tuple set contains one or more tuples // from the "specialTuples" set, we multiply its value by "multiplier" // public IdentityHashMap<Tuple, Integer> // getWeightedCounts(IdentityHashSet<Tuple> specialTuples, int multiplier) { // IdentityHashMap<Tuple, Integer> repCounts = new IdentityHashMap<Tuple, // Integer>(); // IdentityHashSet<Tuple> specialSets = new IdentityHashSet<Tuple>(); // // for (IdentityHashMap.Entry<Tuple, Integer> e : counts.entrySet()) { // Tuple t = e.getKey(); // // int newCount = counts.get(t); // Tuple rep = getRepresentative(t); // int oldCount = (repCounts.containsKey(rep))? repCounts.get(rep) : 0; // repCounts.put(rep, oldCount + newCount); // if (specialTuples.contains(t)) specialSets.add(rep); // } // // for (IdentityHashMap.Entry<Tuple, Integer> e : repCounts.entrySet()) { // if (specialSets.contains(e.getKey())) e.setValue(e.getValue() * // multiplier); // } // // return repCounts; // } public IdentityHashMap<Tuple, Double> getWeightedCounts( float syntheticMultipler, float omittableMultiplier) { IdentityHashMap<Tuple, Double> repCounts = new IdentityHashMap<Tuple, Double>(); for (IdentityHashMap.Entry<Tuple, Integer> e : counts.entrySet()) { Tuple t = e.getKey(); float newCount = counts.get(t); if (((ExampleTuple) t).synthetic) newCount = newCount * syntheticMultipler; if (((ExampleTuple) t).omittable) newCount = newCount * omittableMultiplier; Tuple rep = getRepresentative(t); double oldCount = (repCounts.containsKey(rep)) ? repCounts.get(rep) : 0; repCounts.put(rep, oldCount + newCount); // if (specialTuples.contains(t)) specialSets.add(rep); } /* * for (IdentityHashMap.Entry<Tuple, Integer> e : repCounts.entrySet()) * { if (specialSets.contains(e.getKey())) e.setValue(e.getValue() * multiplier); } */ return repCounts; } // get all members of the set containing t public Collection<Tuple> getMembers(Tuple t) { Tuple representative = getRepresentative(t); Collection<Tuple> members = new LinkedList<Tuple>(); for (IdentityHashMap.Entry<Tuple, Integer> e : counts.entrySet()) { Tuple t1 = e.getKey(); if (getRepresentative(t1) == representative) members.add(t1); } return members; } // get a mapping from set representatives to members public IdentityHashMap<Tuple, Collection<Tuple>> getMembershipMap() { IdentityHashMap<Tuple, Collection<Tuple>> map = new IdentityHashMap<Tuple, Collection<Tuple>>(); for (IdentityHashMap.Entry<Tuple, Integer> e : counts.entrySet()) { Tuple t = e.getKey(); Tuple representative = getRepresentative(t); Collection<Tuple> members = map.get(representative); if (members == null) { members = new LinkedList<Tuple>(); map.put(representative, members); } members.add(t); } return map; } }