/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pig.test;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import junit.framework.TestCase;
import org.apache.pig.ExecType;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.plan.VisitorException;
import org.apache.pig.newplan.Operator;
import org.apache.pig.newplan.OperatorPlan;
import org.apache.pig.newplan.logical.LogicalPlanMigrationVistor;
import org.apache.pig.newplan.logical.optimizer.LogicalPlanOptimizer;
import org.apache.pig.newplan.logical.optimizer.SchemaResetter;
import org.apache.pig.newplan.logical.relational.LOLoad;
import org.apache.pig.newplan.logical.relational.LogicalPlan;
import org.apache.pig.newplan.logical.relational.LogicalRelationalOperator;
import org.apache.pig.newplan.logical.rules.AddForEach;
import org.apache.pig.newplan.logical.rules.ColumnMapKeyPrune;
import org.apache.pig.newplan.logical.rules.MapKeysPruneHelper;
import org.apache.pig.newplan.optimizer.PlanOptimizer;
import org.apache.pig.newplan.optimizer.Rule;
import org.apache.pig.test.utils.LogicalPlanTester;
/**
 * Tests the {@link ColumnMapKeyPrune} and {@link AddForEach} rules on the new logical plan.
 *
 * <p>Every scenario follows the same recipe: a Pig script is built with
 * {@link LogicalPlanTester}, migrated to the new logical plan, and run through
 * {@link MyPlanOptimizer}. The result is then compared, via {@code isEqual}, with the
 * un-optimized plan of a second script that spells out the expected pruned form by hand.
 */
public class TestNewPlanColumnPrune extends TestCase {

    // Not referenced by any test in this class; retained so the class's members stay unchanged.
    LogicalPlan plan = null;

    PigContext pc = new PigContext(ExecType.LOCAL, new Properties());

    /**
     * Converts an old-style logical plan into a new logical plan and resets its schemas.
     *
     * @param lp the old-style logical plan to migrate
     * @return the migrated plan, after a {@link SchemaResetter} pass over it
     * @throws FrontendException if the migration or schema-reset visitor fails
     */
    private LogicalPlan migratePlan(org.apache.pig.impl.logicalLayer.LogicalPlan lp) throws FrontendException {
        LogicalPlanMigrationVistor visitor = new LogicalPlanMigrationVistor(lp);
        visitor.visit();
        org.apache.pig.newplan.logical.relational.LogicalPlan newPlan = visitor.getNewLogicalPlan();
        SchemaResetter schemaResetter = new SchemaResetter(newPlan);
        schemaResetter.visit();
        return newPlan;
    }

    /**
     * Feeds the statements, in order, to a fresh {@link LogicalPlanTester} and migrates the
     * plan returned for the last statement.
     *
     * @param statements Pig Latin statements; the last one yields the plan that is migrated
     * @return the migrated, not yet optimized, new logical plan
     * @throws Exception if building or migrating the plan fails
     */
    private LogicalPlan buildNewPlan(String... statements) throws Exception {
        LogicalPlanTester lpt = new LogicalPlanTester(pc);
        org.apache.pig.impl.logicalLayer.LogicalPlan oldPlan = null;
        for (String statement : statements) {
            // Only the plan returned for the final statement is used.
            oldPlan = lpt.buildPlan(statement);
        }
        return migratePlan(oldPlan);
    }

    /**
     * Builds the new logical plan for the statements and runs {@link MyPlanOptimizer} on it
     * with an iteration argument of 3.
     *
     * @param statements Pig Latin statements describing the script under test
     * @return the plan after optimization
     * @throws Exception if building, migrating or optimizing the plan fails
     */
    private LogicalPlan buildOptimizedPlan(String... statements) throws Exception {
        LogicalPlan newPlan = buildNewPlan(statements);
        PlanOptimizer optimizer = new MyPlanOptimizer(newPlan, 3);
        optimizer.optimize();
        return newPlan;
    }

    /**
     * Asserts that {@code expected.isEqual(actual)} holds, naming the scenario on failure.
     *
     * @param scenario short label identifying the scenario in the failure message
     * @param expected the hand-written expected plan
     * @param actual the optimized plan under test
     * @throws Exception if the plan comparison itself fails
     */
    private void assertPlansEqual(String scenario, LogicalPlan expected, LogicalPlan actual) throws Exception {
        assertTrue("optimized plan differs from expected plan: " + scenario, expected.isEqual(actual));
    }

    /**
     * Reads the {@link MapKeysPruneHelper#REQUIRED_MAPKEYS} annotation of a load operator.
     *
     * @param load the load operator to inspect
     * @return the annotation value cast to a map, or {@code null} if the annotation is absent
     */
    @SuppressWarnings("unchecked")
    private Map<Integer, Set<String>> requiredMapKeys(LOLoad load) {
        // The annotation is stored untyped, so the cast cannot be checked by the compiler.
        return (Map<Integer, Set<String>>) load.getAnnotation(MapKeysPruneHelper.REQUIRED_MAPKEYS);
    }

    /** Scripts that are expected to come out of the optimizer unchanged. */
    public void testNoPrune() throws Exception {
        // no foreach
        LogicalPlan actual = buildOptimizedPlan(
                "a = load 'd.txt' as (id, v1, v2);",
                "b = filter a by v1==NULL;",
                "store b into 'empty';");
        LogicalPlan expected = buildNewPlan(
                "a = load 'd.txt' as (id, v1, v2);",
                "b = filter a by v1==NULL;",
                "store b into 'empty';");
        assertPlansEqual("no foreach", expected, actual);

        // no schema
        actual = buildOptimizedPlan(
                "a = load 'd.txt';",
                "b = foreach a generate $0, $1;",
                "store b into 'empty';");
        expected = buildNewPlan(
                "a = load 'd.txt';",
                "b = foreach a generate $0, $1;",
                "store b into 'empty';");
        assertPlansEqual("no schema", expected, actual);
    }

    /** Scripts whose unused columns are expected to be pruned. */
    public void testPrune() throws Exception {
        // only foreach
        LogicalPlan actual = buildOptimizedPlan(
                "a = load 'd.txt' as (id, v1, v2);",
                "b = foreach a generate id;",
                "store b into 'empty';");
        LogicalPlan expected = buildNewPlan(
                "a = load 'd.txt' as (id);",
                "b = foreach a generate id;",
                "store b into 'empty';");
        assertPlansEqual("only foreach", expected, actual);

        // with filter
        actual = buildOptimizedPlan(
                "a = load 'd.txt' as (id, v1, v5, v3, v4, v2);",
                "b = filter a by v1 != NULL AND (v2+v3)<100;",
                "c = foreach b generate id;",
                "store c into 'empty';");
        expected = buildNewPlan(
                "a = load 'd.txt' as (id, v1, v3, v2);",
                "b = filter a by v1 != NULL AND (v2+v3)<100;",
                "c = foreach b generate id;",
                "store c into 'empty';");
        assertPlansEqual("with filter", expected, actual);

        // with 2 foreach, the first one generating a column the second one drops
        actual = buildOptimizedPlan(
                "a = load 'd.txt' as (id, v1, v5, v3, v4, v2);",
                "b = foreach a generate v2, v5, v4;",
                "c = foreach b generate v5, v4;",
                "store c into 'empty';");
        expected = buildNewPlan(
                "a = load 'd.txt' as (v5, v4);",
                "b = foreach a generate v5, v4;",
                "c = foreach b generate v5, v4;",
                "store c into 'empty';");
        assertPlansEqual("with 2 foreach (first generates v2, v5, v4)", expected, actual);

        // with 2 foreach, the first one generating several columns the second one drops
        actual = buildOptimizedPlan(
                "a = load 'd.txt' as (id, v1, v5, v3, v4, v2);",
                "b = foreach a generate id, v1, v5, v3, v4;",
                "c = foreach b generate v5, v4;",
                "store c into 'empty';");
        expected = buildNewPlan(
                "a = load 'd.txt' as (v5, v4);",
                "b = foreach a generate v5, v4;",
                "c = foreach b generate v5, v4;",
                "store c into 'empty';");
        assertPlansEqual("with 2 foreach (first generates id, v1, v5, v3, v4)", expected, actual);

        // with 2 foreach and filter in between
        actual = buildOptimizedPlan(
                "a = load 'd.txt' as (id, v1, v5, v3, v4, v2);",
                "b = foreach a generate v2, v5, v4;",
                "c = filter b by v2 != NULL;",
                "d = foreach c generate v5, v4;",
                "store d into 'empty';");
        expected = buildNewPlan(
                "a = load 'd.txt' as (v5, v4, v2);",
                "b = foreach a generate v2, v5, v4;",
                "c = filter b by v2 != NULL;",
                "d = foreach c generate v5, v4;",
                "store d into 'empty';");
        assertPlansEqual("with 2 foreach and filter in between", expected, actual);

        // with foreach after join
        actual = buildOptimizedPlan(
                "a = load 'd.txt' as (id, v1, v2, v3);",
                "b = load 'c.txt' as (id, v4, v5, v6);",
                "c = join a by id, b by id;",
                "d = foreach c generate a::id, v5, v3, v4;",
                "store d into 'empty';");
        expected = buildNewPlan(
                "a = load 'd.txt' as (id, v3);",
                "b = load 'c.txt' as (id, v4, v5);",
                "c = join a by id, b by id;",
                "d = foreach c generate a::id, v5, v3, v4;",
                "store d into 'empty';");
        assertPlansEqual("with foreach after join", expected, actual);

        // with BinStorage, insert foreach after load
        actual = buildOptimizedPlan(
                "a = load 'd.txt' using BinStorage() as (id, v1, v5, v3, v4, v2);",
                "c = filter a by v2 != NULL;",
                "d = foreach c generate v5, v4;",
                "store d into 'empty';");
        expected = buildNewPlan(
                "a = load 'd.txt' using BinStorage() as (id, v1, v5, v3, v4, v2);",
                "b = foreach a generate v5, v4, v2;",
                "c = filter b by v2 != NULL;",
                "d = foreach c generate v5, v4;",
                "store d into 'empty';");
        assertPlansEqual("with BinStorage, insert foreach after load", expected, actual);

        // with BinStorage, not to insert foreach after load if there is already one
        actual = buildOptimizedPlan(
                "a = load 'd.txt' using BinStorage() as (id, v1, v5, v3, v4, v2);",
                "b = foreach a generate v5, v4, v2;",
                "c = filter b by v2 != NULL;",
                "d = foreach c generate v5;",
                "store d into 'empty';");
        expected = buildNewPlan(
                "a = load 'd.txt' using BinStorage() as (id, v1, v5, v3, v4, v2);",
                "b = foreach a generate v5, v2;",
                "c = filter b by v2 != NULL;",
                "d = foreach c generate v5;",
                "store d into 'empty';");
        assertPlansEqual("with BinStorage, existing foreach after load", expected, actual);

        // same as above, but the existing foreach also generates a constant
        actual = buildOptimizedPlan(
                "a = load 'd.txt' using BinStorage() as (id, v1, v5, v3, v4, v2);",
                "b = foreach a generate v5, v4, v2, 10;",
                "c = filter b by v2 != NULL;",
                "d = foreach c generate v5;",
                "store d into 'empty';");
        expected = buildNewPlan(
                "a = load 'd.txt' using BinStorage() as (id, v1, v5, v3, v4, v2);",
                "b = foreach a generate v5, v2, 10;",
                "c = filter b by v2 != NULL;",
                "d = foreach c generate v5;",
                "store d into 'empty';");
        assertPlansEqual("with BinStorage, existing foreach generating a constant", expected, actual);
    }

    /** Column pruning combined with the required-map-keys annotation on the load operators. */
    public void testPruneWithMapKey() throws Exception {
        Set<String> pathOnly = new HashSet<String>();
        pathOnly.add("path");

        // only foreach
        LogicalPlan actual = buildOptimizedPlan(
                "a = load 'd.txt' as (id, v1, m:map[]);",
                "b = foreach a generate id, m#'path';",
                "store b into 'empty';");
        LogicalPlan expected = buildNewPlan(
                "a = load 'd.txt' as (id, m:map[]);",
                "b = foreach a generate id, m#'path';",
                "store b into 'empty';");
        assertPlansEqual("map key, only foreach", expected, actual);

        LOLoad load = (LOLoad) actual.getSources().get(0);
        Map<Integer, Set<String>> annotation = requiredMapKeys(load);
        assertNotNull("load should carry a required-map-keys annotation", annotation);
        assertEquals(1, annotation.size());
        // NOTE(review): key 2 presumably is the map column's index in the original
        // (id, v1, m) load schema - confirm against MapKeysPruneHelper.
        assertEquals(pathOnly, annotation.get(2));

        // foreach with join
        actual = buildOptimizedPlan(
                "a = load 'd.txt' as (id, v1, m:map[]);",
                "b = load 'd.txt' as (id, v1, m:map[]);",
                "c = join a by id, b by id;",
                "d = filter c by a::m#'path' != NULL;",
                "e = foreach d generate a::id, b::id, b::m#'path', a::m;",
                "store e into 'empty';");
        expected = buildNewPlan(
                "a = load 'd.txt' as (id, m:map[]);",
                "b = load 'd.txt' as (id, m:map[]);",
                "c = join a by id, b by id;",
                "d = filter c by a::m#'path' != NULL;",
                "e = foreach d generate a::id, b::id, b::m#'path', a::m;",
                "store e into 'empty';");
        assertPlansEqual("map key, foreach with join", expected, actual);

        List<Operator> sources = actual.getSources();
        assertEquals(2, sources.size());

        // Locate the two loads by alias.
        LOLoad loadA = null;
        LOLoad loadB = null;
        for (Operator source : sources) {
            String alias = ((LogicalRelationalOperator) source).getAlias();
            if ("a".equals(alias)) {
                loadA = (LOLoad) source;
            } else if ("b".equals(alias)) {
                loadB = (LOLoad) source;
            }
        }
        assertNotNull("no load with alias 'a' among the plan sources", loadA);
        assertNotNull("no load with alias 'b' among the plan sources", loadB);

        // The script uses a::m as a whole, and load 'a' is expected to have no annotation.
        assertNull(requiredMapKeys(loadA));

        // Only b::m#'path' is referenced from load 'b'.
        annotation = requiredMapKeys(loadB);
        assertNotNull("load 'b' should carry a required-map-keys annotation", annotation);
        assertEquals(1, annotation.size());
        assertEquals(pathOnly, annotation.get(2));
    }

    /** A bag column flattened and then projected; the expected plan is the same script. */
    public void testPruneWithBag() throws Exception {
        // flatten of a bag followed by a projection of one of its fields
        LogicalPlan actual = buildOptimizedPlan(
                "a = load 'd.txt' as (id, v:bag{t:(s1,s2,s3)});",
                "b = filter a by id>10;",
                "c = foreach b generate id, FLATTEN(v);",
                "d = foreach c generate id, v::s2;",
                "store d into 'empty';");
        LogicalPlan expected = buildNewPlan(
                "a = load 'd.txt' as (id, v:bag{t:(s1,s2,s3)});",
                "b = filter a by id>10;",
                "c = foreach b generate id, FLATTEN(v);",
                "d = foreach c generate id, v::s2;",
                "store d into 'empty';");
        assertPlansEqual("flatten of bag", expected, actual);
    }

    /** Scenarios around the {@link AddForEach} rule. */
    public void testAddForeach() throws Exception {
        // filter above foreach
        LogicalPlan actual = buildOptimizedPlan(
                "a = load 'd.txt' as (id, v1, v2);",
                "b = filter a by v1>10;",
                "c = foreach b generate id;",
                "store c into 'empty';");
        LogicalPlan expected = buildNewPlan(
                "a = load 'd.txt' as (id, v1);",
                "b = filter a by v1>10;",
                "c = foreach b generate id;",
                "store c into 'empty';");
        assertPlansEqual("filter above foreach", expected, actual);

        // join with foreach: the expected plan has an extra foreach right after the join
        actual = buildOptimizedPlan(
                "a = load 'd.txt' as (id, v1, v2);",
                "b = load 'd.txt' as (id, v1, v2);",
                "c = join a by id, b by id;",
                "d = filter c by a::v1>b::v1;",
                "e = foreach d generate a::id;",
                "store e into 'empty';");
        expected = buildNewPlan(
                "a = load 'd.txt' as (id, v1);",
                "b = load 'd.txt' as (id, v1);",
                "c = join a by id, b by id;",
                "d = foreach c generate a::id, a::v1, b::v1;",
                "e = filter d by a::v1>b::v1;",
                "f = foreach e generate a::id;",
                "store f into 'empty';");
        assertPlansEqual("join with foreach", expected, actual);
    }

    /**
     * Optimizer restricted to the two rules under test: {@link ColumnMapKeyPrune} followed by
     * {@link AddForEach}, each in its own rule set.
     */
    public class MyPlanOptimizer extends LogicalPlanOptimizer {

        /**
         * @param p the plan to optimize
         * @param iterations iteration argument forwarded to the superclass constructor
         */
        protected MyPlanOptimizer(OperatorPlan p, int iterations) {
            super(p, iterations, null);
        }

        /**
         * Builds the rule sets used by this optimizer.
         *
         * <p>NOTE(review): presumably overrides {@code LogicalPlanOptimizer.buildRuleSets()};
         * add {@code @Override} once that is confirmed.
         *
         * @return two singleton rule sets, in order: ColumnMapKeyPrune, then AddForEach
         */
        protected List<Set<Rule>> buildRuleSets() {
            List<Set<Rule>> ruleSets = new ArrayList<Set<Rule>>();

            Set<Rule> pruneRules = new HashSet<Rule>();
            pruneRules.add(new ColumnMapKeyPrune("ColumnMapKeyPrune"));
            ruleSets.add(pruneRules);

            Set<Rule> addForEachRules = new HashSet<Rule>();
            addForEachRules.add(new AddForEach("AddForEach"));
            ruleSets.add(addForEachRules);

            return ruleSets;
        }
    }
}