/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.test;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.pig.ExecType;
import org.apache.pig.Expression;
import org.apache.pig.LoadFunc;
import org.apache.pig.LoadMetadata;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceStatistics;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.Tuple;
import org.apache.pig.newplan.logical.LogicalPlanMigrationVistor;
import org.apache.pig.newplan.logical.expression.LogicalExpression;
import org.apache.pig.newplan.logical.optimizer.LogicalPlanOptimizer;
import org.apache.pig.newplan.logical.relational.LOFilter;
import org.apache.pig.newplan.logical.relational.LOLoad;
import org.apache.pig.newplan.logical.relational.LogicalPlan;
import org.apache.pig.newplan.logical.rules.PartitionFilterOptimizer;
import org.apache.pig.newplan.logical.rules.LoadTypeCastInserter;
import org.apache.pig.newplan.Operator;
import org.apache.pig.newplan.OperatorPlan;
import org.apache.pig.newplan.PColFilterExtractor;
import org.apache.pig.newplan.optimizer.PlanOptimizer;
import org.apache.pig.newplan.optimizer.Rule;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.logicalLayer.PlanSetter;
import org.apache.pig.impl.logicalLayer.parser.ParseException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.plan.VisitorException;
import org.apache.pig.impl.util.LogUtils;
import org.apache.pig.test.utils.LogicalPlanTester;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;

/**
 * Unit tests for extracting partition filter conditions out of the filter
 * condition in the filter following a load which talks to a metadata system
 * (i.e. implements {@link LoadMetadata}).
 */
public class TestPartitionFilterPushDown {

    static PigContext pc = new PigContext(ExecType.LOCAL, new Properties());

    /** Shared plan builder; rebuilt before every test, see {@link #resetPlanTester()}. */
    static LogicalPlanTester lpTester;

    /**
     * Creates a fresh plan tester and defines the base alias {@code a} with
     * the five-column schema used by the filter extraction tests.
     *
     * @throws Exception if the load statement cannot be turned into a plan
     */
    @BeforeClass
    public static void setup() throws Exception {
        lpTester = new LogicalPlanTester(pc);
        lpTester.buildPlan("a = load 'foo' as (srcid, mrkt, dstid, name, age);");
    }

    /**
     * Re-runs {@link #setup()} before each test. The testColNameMapping*
     * tests redefine alias {@code a} with a different loader and schema on
     * the shared tester; without this reset the outcome of the other tests
     * would depend on the (unspecified) order in which JUnit runs them.
     *
     * @throws Exception if the base load statement cannot be planned
     */
    @Before
    public void resetPlanTester() throws Exception {
        setup();
    }

    /** Nothing to release; kept as the class-level teardown hook. */
    @AfterClass
    public static void tearDown() {
    }

    /**
     * test case where there is a single expression on partition columns in
     * the filter expression along with an expression on non partition column
     * @throws IOException
     */
    @Test
    public void testSimpleMixed() throws IOException {
        org.apache.pig.impl.logicalLayer.LogicalPlan lp =
            lpTester.buildPlan("b = filter a by srcid == 10 and name == 'foo';");
        test(lp, Arrays.asList("srcid"), "(srcid == 10)", "(name == 'foo')");
    }

    /**
     * test case where filter does not contain any condition on partition cols
     * @throws Exception
     */
    @Test
    public void testNoPartFilter() throws Exception {
        org.apache.pig.impl.logicalLayer.LogicalPlan lp =
            lpTester.buildPlan("b = filter a by age == 20 and name == 'foo';");
        test(lp, Arrays.asList("srcid"), null,
                "((age == 20) and (name == 'foo'))");
    }

    /**
     * test case where filter only contains condition on partition cols
     * @throws Exception
     */
    @Test
    public void testOnlyPartFilter1() throws Exception {
        org.apache.pig.impl.logicalLayer.LogicalPlan lp =
            lpTester.buildPlan("b = filter a by srcid > 20 and mrkt == 'us';");
        test(lp, Arrays.asList("srcid", "mrkt"),
                "((srcid > 20) and (mrkt == 'us'))", null);
    }

    /**
     * test case where filter only contains condition on partition cols
     * @throws Exception
     */
    @Test
    public void testOnlyPartFilter2() throws Exception {
        org.apache.pig.impl.logicalLayer.LogicalPlan lp =
            lpTester.buildPlan("b = filter a by mrkt == 'us';");
        test(lp, Arrays.asList("srcid", "mrkt"),
                "(mrkt == 'us')", null);
    }

    /**
     * test case where filter only contains condition on partition cols
     * @throws Exception
     */
    @Test
    public void testOnlyPartFilter3() throws Exception {
        org.apache.pig.impl.logicalLayer.LogicalPlan lp =
            lpTester.buildPlan("b = filter a by srcid == 20 or mrkt == 'us';");
        test(lp, Arrays.asList("srcid", "mrkt"),
                "((srcid == 20) or (mrkt == 'us'))", null);
    }

    /**
     * test case where filter has both conditions on partition cols and non
     * partition cols and the filter condition will be split to extract the
     * conditions on partition columns
     */
    @Test
    public void testMixed1() throws Exception {
        org.apache.pig.impl.logicalLayer.LogicalPlan lp =
            lpTester.buildPlan("b = filter a by " +
                    "(age < 20 and mrkt == 'us') and (srcid == 10 and " +
                    "name == 'foo');");
        test(lp, Arrays.asList("srcid", "mrkt"),
                "((mrkt == 'us') and (srcid == 10))",
                "((age < 20) and (name == 'foo'))");
    }

    /**
     * test case where filter has both conditions on partition cols and non
     * partition cols and the filter condition will be split to extract the
     * conditions on partition columns
     */
    @Test
    public void testMixed2() throws Exception {
        org.apache.pig.impl.logicalLayer.LogicalPlan lp =
            lpTester.buildPlan("b = filter a by " +
                    "(age >= 20 and mrkt == 'us') and (srcid == 10 and " +
                    "dstid == 15);");
        test(lp, Arrays.asList("srcid", "dstid", "mrkt"),
                "((mrkt == 'us') and ((srcid == 10) and (dstid == 15)))",
                "(age >= 20)");
    }

    /**
     * test case where filter has both conditions on partition cols and non
     * partition cols and the filter condition will be split to extract the
     * conditions on partition columns
     */
    @Test
    public void testMixed3() throws Exception {
        org.apache.pig.impl.logicalLayer.LogicalPlan lp =
            lpTester.buildPlan("b = filter a by " +
                    "age >= 20 and mrkt == 'us' and srcid == 10;");
        test(lp, Arrays.asList("srcid", "dstid", "mrkt"),
                "((mrkt == 'us') and (srcid == 10))", "(age >= 20)");
    }

    /**
     * test case where filter has both conditions on partition cols and non
     * partition cols and the filter condition will be split to extract the
     * conditions on partition columns - this testcase also has a condition
     * based on comparison of two partition columns
     */
    @Test
    public void testMixed4() throws Exception {
        org.apache.pig.impl.logicalLayer.LogicalPlan lp =
            lpTester.buildPlan("b = filter a by " +
                    "age >= 20 and mrkt == 'us' and name == 'foo' and " +
                    "srcid == dstid;");
        test(lp, Arrays.asList("srcid", "dstid", "mrkt"),
                "((mrkt == 'us') and (srcid == dstid))",
                "((age >= 20) and (name == 'foo'))");
    }

    /**
     * test case where filter has both conditions on partition cols and non
     * partition cols and the filter condition will be split to extract the
     * conditions on partition columns -
     * This testcase has two partition col conditions with OR + non partition
     * col conditions
     */
    @Test
    public void testMixed5() throws Exception {
        org.apache.pig.impl.logicalLayer.LogicalPlan lp =
            lpTester.buildPlan("b = filter a by " +
                    "(srcid == 10 or mrkt == 'us') and name == 'foo' and " +
                    "dstid == 30;");
        test(lp, Arrays.asList("srcid", "dstid", "mrkt"),
                "(((srcid == 10) or (mrkt == 'us')) and (dstid == 30))",
                "(name == 'foo')");
    }

    /**
     * test case where filter has both conditions on partition cols and non
     * partition cols and the filter condition will be split to extract the
     * conditions on partition columns -
     * This testcase has two partition col conditions with OR + non partition
     * col conditions
     */
    @Test
    public void testMixed6() throws Exception {
        org.apache.pig.impl.logicalLayer.LogicalPlan lp =
            lpTester.buildPlan("b = filter a by " +
                    "dstid == 30 and (srcid == 10 or mrkt == 'us') and name == 'foo';");
        test(lp, Arrays.asList("srcid", "dstid", "mrkt"),
                "((dstid == 30) and ((srcid == 10) or (mrkt == 'us')))",
                "(name == 'foo')");
    }

    /**
     * test case where filter has both conditions on partition cols and non
     * partition cols and the filter condition will be split to extract the
     * conditions on partition columns. This testcase also tests arithmetic
     * in partition column conditions
     */
    @Test
    public void testMixedArith() throws Exception {
        org.apache.pig.impl.logicalLayer.LogicalPlan lp =
            lpTester.buildPlan("b = filter a by " +
                    "mrkt == 'us' and srcid * 10 == 150 + 20 and age != 15;");
        test(lp, Arrays.asList("srcid", "dstid", "mrkt"),
                "((mrkt == 'us') and ((srcid * 10) == (150 + 20)))",
                "(age != 15)");
    }

    /**
     * Conditions that combine a partition column with a non partition column
     * are expected to be rejected with error code 1111.
     */
    @Test
    public void testNegPColConditionWithNonPCol() throws Exception {
        // use of partition column condition and non partition column in
        // same condition should fail
        org.apache.pig.impl.logicalLayer.LogicalPlan lp =
            lpTester.buildPlan("b = filter a by " +
                    "srcid > age;");
        negativeTest(lp, Arrays.asList("srcid"), 1111);

        lp = lpTester.buildPlan("b = filter a by " +
                "srcid + age == 20;");
        negativeTest(lp, Arrays.asList("srcid"), 1111);

        // OR of partition column condition and non partition col condition
        // should fail
        lp = lpTester.buildPlan("b = filter a by " +
                "srcid > 10 or name == 'foo';");
        negativeTest(lp, Arrays.asList("srcid"), 1111);
    }

    /**
     * Partition columns used in positions the extractor is expected to
     * reject: first a case expected to fail with error code 1112, then a
     * series of cases expected to fail with error code 1110.
     */
    @Test
    public void testNegPColInWrongPlaces() throws Exception {
        int expectedErrCode = 1112;
        org.apache.pig.impl.logicalLayer.LogicalPlan lp =
            lpTester.buildPlan("b = filter a by " +
                    "(srcid > 10 and name == 'foo') or dstid == 10;");
        negativeTest(lp, Arrays.asList("srcid", "dstid"), expectedErrCode);

        expectedErrCode = 1110;
        lp = lpTester.buildPlan("b = filter a by " +
                "CONCAT(mrkt, '_10') == 'US_10' and age == 20;");
        negativeTest(lp, Arrays.asList("srcid", "dstid", "mrkt"), expectedErrCode);

        lp = lpTester.buildPlan("b = filter a by " +
                "mrkt matches '.*us.*' and age < 15;");
        negativeTest(lp, Arrays.asList("srcid", "dstid", "mrkt"), expectedErrCode);

        lp = lpTester.buildPlan("b = filter a by " +
                "(int)mrkt == 10 and name matches '.*foo.*';");
        negativeTest(lp, Arrays.asList("srcid", "dstid", "mrkt"), expectedErrCode);

        lp = lpTester.buildPlan("b = filter a by " +
                "(mrkt == 'us' ? age : age + 10) == 40 and name matches '.*foo.*';");
        negativeTest(lp, Arrays.asList("srcid", "dstid", "mrkt"), expectedErrCode);

        lp = lpTester.buildPlan("b = filter a by " +
                "(mrkt is null) and name matches '.*foo.*';");
        negativeTest(lp, Arrays.asList("srcid", "dstid", "mrkt"), expectedErrCode);

        lp = lpTester.buildPlan("b = filter a by " +
                "(mrkt is not null) and name matches '.*foo.*';");
        negativeTest(lp, Arrays.asList("srcid", "dstid", "mrkt"), expectedErrCode);
    }

    /**
     * Test that pig sends correct partition column names in setPartitionFilter
     * when the user has a schema in the load statement which renames partition
     * columns
     * @throws Exception
     */
    @Test
    public void testColNameMapping1() throws Exception {
        TestLoader.partFilter = null;
        lpTester.buildPlan("a = load 'foo' using "
                + TestLoader.class.getName() +
                "('srcid:int, mrkt:chararray, dstid:int, name:chararray, age:int', " +
                "'srcid,mrkt') as (f1, f2, f3, f4, f5);");
        org.apache.pig.impl.logicalLayer.LogicalPlan lp =
            lpTester.buildPlan("b = filter a by " +
                    "(f5 >= 20 and f2 == 'us') and (f1 == 10 and f3 == 15);");

        LogicalPlan newLogicalPlan = migrateAndOptimizePlan(lp);

        Assert.assertEquals("checking partition filter:",
                "((mrkt == 'us') and (srcid == 10))",
                pushedPartitionFilter());
        LOFilter filter = (LOFilter) newLogicalPlan.getSinks().get(0);
        Assert.assertEquals("checking trimmed filter expression:",
                "((f5 >= 20) and (f3 == 15))",
                trimmedFilterExpression(filter));
    }

    /**
     * Migrates the old-style plan to the new logical plan and runs
     * {@link MyPlanOptimizer} over it with an iteration limit of 3.
     *
     * @param plan plan produced by the plan tester
     * @return the migrated plan, after optimization
     * @throws IOException if migration or optimization fails
     */
    private LogicalPlan migrateAndOptimizePlan(org.apache.pig.impl.logicalLayer.LogicalPlan plan)
            throws IOException {
        LogicalPlan newLogicalPlan = migratePlan(plan);
        PlanOptimizer optimizer = new MyPlanOptimizer(newLogicalPlan, 3);
        optimizer.optimize();
        return newLogicalPlan;
    }

    /**
     * Test that pig sends correct partition column names in setPartitionFilter
     * when the user has a schema in the load statement which renames partition
     * columns - in this test case there is no condition on partition columns
     * - so setPartitionFilter() should not be called and the filter condition
     * should remain as is.
     * @throws Exception
     */
    @Test
    public void testColNameMapping2() throws Exception {
        TestLoader.partFilter = null;
        lpTester.buildPlan("a = load 'foo' using "
                + TestLoader.class.getName() +
                "('srcid:int, mrkt:chararray, dstid:int, name:chararray, age:int', " +
                "'srcid') as (f1, f2, f3, f4, f5);");
        org.apache.pig.impl.logicalLayer.LogicalPlan lp =
            lpTester.buildPlan("b = filter a by " +
                    "f5 >= 20 and f2 == 'us' and f3 == 15;");

        LogicalPlan newLogicalPlan = migrateAndOptimizePlan(lp);

        Assert.assertNull("checking partition filter:", TestLoader.partFilter);
        LOFilter filter = (LOFilter) newLogicalPlan.getSinks().get(0);
        Assert.assertEquals("checking trimmed filter expression:",
                "(((f5 >= 20) and (f2 == 'us')) and (f3 == 15))",
                trimmedFilterExpression(filter));
    }

    /**
     * Test that pig sends correct partition column names in setPartitionFilter
     * when the user has a schema in the load statement which renames partition
     * columns - in this test case the filter only has conditions on partition
     * columns
     * @throws Exception
     */
    @Test
    public void testColNameMapping3() throws Exception {
        TestLoader.partFilter = null;
        lpTester.buildPlan("a = load 'foo' using "
                + TestLoader.class.getName() +
                "('srcid:int, mrkt:chararray, dstid:int, name:chararray, age:int', " +
                "'srcid,mrkt,dstid,age') as (f1, f2, f3, f4, f5);");
        org.apache.pig.impl.logicalLayer.LogicalPlan lp =
            lpTester.buildPlan("b = filter a by " +
                    "(f5 >= 20 or f2 == 'us') and (f1 == 10 and f3 == 15);");

        LogicalPlan newLogicalPlan = migrateAndOptimizePlan(lp);

        Assert.assertEquals("checking partition filter:",
                "(((age >= 20) or (mrkt == 'us')) and ((srcid == 10) and " +
                "(dstid == 15)))",
                pushedPartitionFilter());
        Iterator<Operator> it = newLogicalPlan.getOperators();
        Assert.assertTrue("Checking that filter has been removed since it contained" +
                " only conditions on partition cols:",
                (it.next() instanceof LOLoad));
        Assert.assertFalse("Checking that filter has been removed since it contained" +
                " only conditions on partition cols:",
                it.hasNext());
    }

    /**
     * Test that pig sends correct partition column names in setPartitionFilter
     * when the user has a schema in the load statement which renames partition
     * columns - in this test case the schema in load statement is a prefix
     * (with columns renamed) of the schema returned by
     * {@link LoadMetadata#getSchema(String, Configuration)}
     * @throws Exception
     */
    @Test
    public void testColNameMapping4() throws Exception {
        TestLoader.partFilter = null;
        lpTester.buildPlan("a = load 'foo' using "
                + TestLoader.class.getName() +
                "('srcid:int, mrkt:chararray, dstid:int, name:chararray, age:int', " +
                "'srcid,mrkt') as (f1, f2, f3);");
        org.apache.pig.impl.logicalLayer.LogicalPlan lp =
            lpTester.buildPlan("b = filter a by " +
                    "(age >= 20 and f2 == 'us') and (f1 == 10 and f3 == 15);");

        LogicalPlan newLogicalPlan = migrateAndOptimizePlan(lp);

        Assert.assertEquals("checking partition filter:",
                "((mrkt == 'us') and (srcid == 10))",
                pushedPartitionFilter());
        LOFilter filter = (LOFilter) newLogicalPlan.getSinks().get(0);
        Assert.assertEquals("checking trimmed filter expression:",
                "((age >= 20) and (f3 == 15))",
                trimmedFilterExpression(filter));
    }

    /**
     * Test PIG-1267
     * @throws Exception
     */
    @Test
    public void testColNameMapping5() throws Exception {
        TestLoader.partFilter = null;
        lpTester.buildPlan("a = load 'foo' using "
                + TestLoader.class.getName() +
                "('mrkt:chararray, a1:chararray, a2:chararray, srcid:int, bcookie:chararray', " +
                "'srcid');");
        lpTester.buildPlan("b = load 'bar' using "
                + TestLoader.class.getName() +
                "('dstid:int, b1:int, b2:int, srcid:int, bcookie:chararray, mrkt:chararray'," +
                "'srcid');");
        lpTester.buildPlan("a1 = filter a by srcid == 10;");
        lpTester.buildPlan("b1 = filter b by srcid == 20;");
        lpTester.buildPlan("c = join a1 by bcookie, b1 by bcookie;");
        org.apache.pig.impl.logicalLayer.LogicalPlan lp = lpTester
            .buildPlan("d = foreach c generate $4 as bcookie:chararray, " +
                    "$5 as dstid:int, $0 as mrkt:chararray;");

        new PlanSetter(lp).visit();

        LogicalPlan newLogicalPlan = migrateAndOptimizePlan(lp);

        // Both loads share the static TestLoader.partFilter slot, so either
        // pushed-down filter may be the one that was recorded last.
        String partFilter = pushedPartitionFilter();
        Assert.assertTrue("(srcid == 20)".equals(partFilter) ||
                "(srcid == 10)".equals(partFilter));

        int counter = 0;
        Iterator<Operator> iter = newLogicalPlan.getOperators();
        while (iter.hasNext()) {
            Assert.assertFalse(iter.next() instanceof LOFilter);
            counter++;
        }
        Assert.assertEquals(4, counter);
    }

    //// helper methods ///////

    /**
     * Returns the string form of the filter most recently handed to
     * {@link TestLoader#setPartitionFilter(Expression)}, failing the test
     * with a readable message (rather than a NullPointerException) when no
     * filter was pushed.
     */
    private static String pushedPartitionFilter() {
        Assert.assertNotNull("setPartitionFilter() was expected to be called",
                TestLoader.partFilter);
        return TestLoader.partFilter.toString();
    }

    /**
     * Renders the expression rooted at the first source of the filter's plan
     * via {@link PColFilterExtractor#getExpression} and lower-cases it.
     *
     * @param filter filter operator whose (possibly trimmed) condition is rendered
     * @return lower-cased string form of the filter condition
     * @throws IOException if the expression cannot be rendered
     */
    private static String trimmedFilterExpression(LOFilter filter) throws IOException {
        LogicalExpression root =
            (LogicalExpression) filter.getFilterPlan().getSources().get(0);
        return PColFilterExtractor.getExpression(root).toString().toLowerCase();
    }

    /**
     * Migrates the plan, runs a {@link PColFilterExtractor} over the filter
     * at its sink and checks both the extracted partition condition and what
     * is left of the filter. All comparisons are case-insensitive.
     *
     * @param lp plan whose sink is the filter under test
     * @param partitionCols names of the partition columns
     * @param expPartFilterString expected partition condition, or null if
     *        none should be extracted
     * @param expFilterString expected remaining filter, or null if the
     *        filter should be reported as removable
     * @return the extractor after visiting the filter plan
     * @throws IOException if migration or extraction fails
     */
    private PColFilterExtractor test(org.apache.pig.impl.logicalLayer.LogicalPlan lp,
            List<String> partitionCols, String expPartFilterString,
            String expFilterString) throws IOException {
        LogicalPlan newLogicalPlan = migratePlan(lp);
        LOFilter filter = (LOFilter) newLogicalPlan.getSinks().get(0);
        PColFilterExtractor pColExtractor = new PColFilterExtractor(
                filter.getFilterPlan(), partitionCols);
        pColExtractor.visit();

        if (expPartFilterString == null) {
            Assert.assertNull("Checking partition column filter:",
                    pColExtractor.getPColCondition());
        } else {
            Assert.assertEquals("Checking partition column filter:",
                    expPartFilterString.toLowerCase(),
                    pColExtractor.getPColCondition().toString().toLowerCase());
        }

        if (expFilterString == null) {
            Assert.assertTrue("Check that filter can be removed:",
                    pColExtractor.isFilterRemovable());
        } else {
            // The actual value is lower-cased, so normalise the expectation
            // the same way as is done for the partition filter above.
            Assert.assertEquals("checking trimmed filter expression:",
                    expFilterString.toLowerCase(),
                    trimmedFilterExpression(filter));
        }
        return pColExtractor;
    }

    /**
     * Migrates the plan, runs a {@link PColFilterExtractor} over the filter
     * at its sink and expects the visit to throw an exception carrying the
     * given Pig error code. Fails the test if no exception is thrown.
     *
     * @param lp plan whose sink is the filter under test
     * @param partitionCols names of the partition columns
     * @param expectedErrorCode error code the thrown exception must carry
     * @throws VisitorException if the plan cannot be migrated
     */
    private void negativeTest(org.apache.pig.impl.logicalLayer.LogicalPlan lp,
            List<String> partitionCols, int expectedErrorCode) throws VisitorException {
        LogicalPlan newLogicalPlan = migratePlan(lp);
        LOFilter filter = (LOFilter) newLogicalPlan.getSinks().get(0);
        PColFilterExtractor pColExtractor = new PColFilterExtractor(
                filter.getFilterPlan(), partitionCols);

        try {
            pColExtractor.visit();
        } catch (Exception e) {
            Assert.assertEquals("Checking if exception has right error code",
                    expectedErrorCode, LogUtils.getPigException(e).getErrorCode());
            return;
        }
        Assert.fail("Exception expected!");
    }

    /**
     * this loader is only used to test that partition column filters are given
     * in the manner expected in terms of column names - hence it does not
     * implement many of the methods and only implements required ones.
     */
    public static class TestLoader extends LoadFunc implements LoadMetadata {

        Schema schema;
        String[] partCols;

        /** Last filter passed to {@link #setPartitionFilter}; shared by all instances. */
        static Expression partFilter = null;

        /**
         * @param schemaString schema reported by {@link #getSchema}
         * @param commaSepPartitionCols comma separated partition column names
         *        reported by {@link #getPartitionKeys}
         * @throws ParseException if the schema string cannot be parsed
         */
        public TestLoader(String schemaString, String commaSepPartitionCols)
                throws ParseException {
            schema = Util.getSchemaFromString(schemaString);
            partCols = commaSepPartitionCols.split(",");
        }

        @Override
        public InputFormat getInputFormat() throws IOException {
            return null;
        }

        @Override
        public Tuple getNext() throws IOException {
            return null;
        }

        @Override
        public void prepareToRead(RecordReader reader, PigSplit split)
                throws IOException {
        }

        @Override
        public void setLocation(String location, Job job) throws IOException {
        }

        @Override
        public String[] getPartitionKeys(String location, Job job)
                throws IOException {
            return partCols;
        }

        @Override
        public ResourceSchema getSchema(String location, Job job)
                throws IOException {
            return new ResourceSchema(schema);
        }

        @Override
        public ResourceStatistics getStatistics(String location,
                Job job) throws IOException {
            return null;
        }

        @Override
        public void setPartitionFilter(Expression partitionFilter)
                throws IOException {
            // Recorded in a static so the tests can inspect what was pushed.
            partFilter = partitionFilter;
        }
    }

    /**
     * Optimizer restricted to the two rules these tests need: partition
     * filter push down followed by load type cast insertion. No rules are
     * turned off (an empty set is passed to the superclass).
     */
    public class MyPlanOptimizer extends LogicalPlanOptimizer {

        protected MyPlanOptimizer(OperatorPlan p, int iterations) {
            super(p, iterations, new HashSet<String>());
        }

        /**
         * @return two single-rule sets, in order: the partition filter
         *         optimizer, then the load type cast inserter
         */
        protected List<Set<Rule>> buildRuleSets() {
            List<Set<Rule>> ruleSets = new ArrayList<Set<Rule>>();

            // add partition filter push down rule
            Set<Rule> partitionFilterRules = new HashSet<Rule>();
            partitionFilterRules.add(new PartitionFilterOptimizer("PartitionFilterPushDown"));
            ruleSets.add(partitionFilterRules);

            // add load type cast inserter rule
            Set<Rule> typeCastRules = new HashSet<Rule>();
            typeCastRules.add(new LoadTypeCastInserter("LoadTypeCastInserter"));
            ruleSets.add(typeCastRules);

            return ruleSets;
        }
    }

    /**
     * Converts an old-style logical plan into the new logical plan
     * representation using {@link LogicalPlanMigrationVistor}.
     *
     * @param lp old-style plan to migrate
     * @return the migrated plan
     * @throws VisitorException if the migration visitor fails
     */
    private LogicalPlan migratePlan(org.apache.pig.impl.logicalLayer.LogicalPlan lp)
            throws VisitorException {
        LogicalPlanMigrationVistor visitor = new LogicalPlanMigrationVistor(lp);
        visitor.visit();
        return visitor.getNewLogicalPlan();
    }
}