/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hive.ql.optimizer; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.CompilationOpContext; import org.apache.hadoop.hive.ql.exec.*; import org.apache.hadoop.hive.ql.exec.mr.MapRedTask; import org.apache.hadoop.hive.ql.io.HiveInputFormat; import org.apache.hadoop.hive.ql.io.HiveOutputFormat; import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.plan.*; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; import java.io.Serializable; import java.util.Arrays; import java.util.List; import java.util.Properties; import static org.junit.Assert.*; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.reset; import static org.mockito.Mockito.when; public class TestGenMapRedUtilsCreateConditionalTask { private static HiveConf hiveConf; private Task dummyMRTask; @BeforeClass public static void initializeSessionState() { hiveConf = new HiveConf(); } @Before public void setUp() { dummyMRTask = new MapRedTask(); } @Test public void testMovePathsThatCannotBeMerged() { final Path condInputPath = new Path("s3a://bucket/scratch/-ext-10000"); final Path condOutputPath = new Path("s3a://bucket/scratch/-ext-10002"); final MoveWork mockWork = mock(MoveWork.class); assertFalse("A MoveWork null object cannot be merged.", GenMapRedUtils.shouldMergeMovePaths(hiveConf, condInputPath, condOutputPath, null)); hiveConf.set(HiveConf.ConfVars.HIVE_BLOBSTORE_OPTIMIZATIONS_ENABLED.varname, "false"); assertFalse("Merging paths is not allowed when BlobStorage optimizations are disabled.", GenMapRedUtils.shouldMergeMovePaths(hiveConf, condInputPath, condOutputPath, mockWork)); // Enable BlobStore optimizations for the rest of tests hiveConf.set(HiveConf.ConfVars.HIVE_BLOBSTORE_OPTIMIZATIONS_ENABLED.varname, "true"); reset(mockWork); when(mockWork.getLoadMultiFilesWork()).thenReturn(new LoadMultiFilesDesc()); assertFalse("Merging paths is not allowed when MultiFileWork is found in the MoveWork object.", GenMapRedUtils.shouldMergeMovePaths(hiveConf, condInputPath, condOutputPath, mockWork)); reset(mockWork); when(mockWork.getLoadFileWork()).thenReturn(mock(LoadFileDesc.class)); when(mockWork.getLoadTableWork()).thenReturn(mock(LoadTableDesc.class)); assertFalse("Merging paths is not allowed when both LoadFileWork & LoadTableWork are found in the MoveWork object.", GenMapRedUtils.shouldMergeMovePaths(hiveConf, condInputPath, condOutputPath, mockWork)); reset(mockWork); when(mockWork.getLoadFileWork()).thenReturn(new LoadFileDesc(condInputPath, condOutputPath, false, "", "")); assertFalse("Merging paths is not allowed when both conditional output path is not equals to MoveWork input path.", GenMapRedUtils.shouldMergeMovePaths(hiveConf, condInputPath, condOutputPath, mockWork)); reset(mockWork); when(mockWork.getLoadFileWork()).thenReturn(new LoadFileDesc(condOutputPath, new Path("unused"), false, "", "")); assertFalse("Merging paths is not allowed when conditional input path is not a BlobStore path.", GenMapRedUtils.shouldMergeMovePaths(hiveConf, new Path("hdfs://hdfs-path"), condOutputPath, mockWork)); reset(mockWork); when(mockWork.getLoadFileWork()).thenReturn(new LoadFileDesc(condOutputPath, new Path("hdfs://hdfs-path"), false, "", "")); assertFalse("Merging paths is not allowed when MoveWork output path is not a BlobStore path.", GenMapRedUtils.shouldMergeMovePaths(hiveConf, condInputPath, condOutputPath, mockWork)); } @Test public void testMovePathsThatCanBeMerged() { final Path condInputPath = new Path("s3a://bucket/scratch/-ext-10000"); final Path condOutputPath = new Path("s3a://bucket/scratch/-ext-10002"); final Path targetMoveWorkPath = new Path("s3a://bucket/scratch/-ext-10003"); final MoveWork mockWork = mock(MoveWork.class); when(mockWork.getLoadFileWork()).thenReturn(new LoadFileDesc(condOutputPath, targetMoveWorkPath, false, "", "")); assertTrue("Merging BlobStore paths should be allowed.", GenMapRedUtils.shouldMergeMovePaths(hiveConf, condInputPath, condOutputPath, mockWork)); } @Test(expected = IllegalArgumentException.class) public void testMergePathWithInvalidMoveWorkThrowsException() { final Path condInputPath = new Path("s3a://bucket/scratch/-ext-10000"); final MoveWork mockWork = mock(MoveWork.class); when(mockWork.getLoadMultiFilesWork()).thenReturn(new LoadMultiFilesDesc()); GenMapRedUtils.mergeMovePaths(condInputPath, mockWork); } @Test public void testMergePathValidMoveWorkReturnsNewMoveWork() { final Path condInputPath = new Path("s3a://bucket/scratch/-ext-10000"); final Path condOutputPath = new Path("s3a://bucket/scratch/-ext-10002"); final Path targetMoveWorkPath = new Path("s3a://bucket/scratch/-ext-10003"); final MoveWork mockWork = mock(MoveWork.class); MoveWork newWork; // test using loadFileWork when(mockWork.getLoadFileWork()).thenReturn(new LoadFileDesc(condOutputPath, targetMoveWorkPath, false, "", "")); newWork = GenMapRedUtils.mergeMovePaths(condInputPath, mockWork); assertNotNull(newWork); assertNotEquals(newWork, mockWork); assertEquals(condInputPath, newWork.getLoadFileWork().getSourcePath()); assertEquals(targetMoveWorkPath, newWork.getLoadFileWork().getTargetDir()); // test using loadTableWork TableDesc tableDesc = new TableDesc(); reset(mockWork); when(mockWork.getLoadTableWork()).thenReturn(new LoadTableDesc(condOutputPath, tableDesc, null)); newWork = GenMapRedUtils.mergeMovePaths(condInputPath, mockWork); assertNotNull(newWork); assertNotEquals(newWork, mockWork); assertEquals(condInputPath, newWork.getLoadTableWork().getSourcePath()); assertTrue(newWork.getLoadTableWork().getTable().equals(tableDesc)); } @Test public void testConditionalMoveTaskIsOptimized() throws SemanticException { hiveConf.set(HiveConf.ConfVars.HIVE_BLOBSTORE_OPTIMIZATIONS_ENABLED.varname, "true"); Path sinkDirName = new Path("s3a://bucket/scratch/-ext-10002"); FileSinkOperator fileSinkOperator = createFileSinkOperator(sinkDirName); Path finalDirName = new Path("s3a://bucket/scratch/-ext-10000"); Path tableLocation = new Path("s3a://bucket/warehouse/table"); Task<MoveWork> moveTask = createMoveTask(finalDirName, tableLocation); List<Task<MoveWork>> moveTaskList = Arrays.asList(moveTask); GenMapRedUtils.createMRWorkForMergingFiles(fileSinkOperator, finalDirName, null, moveTaskList, hiveConf, dummyMRTask); ConditionalTask conditionalTask = (ConditionalTask)dummyMRTask.getChildTasks().get(0); Task<? extends Serializable> moveOnlyTask = conditionalTask.getListTasks().get(0); Task<? extends Serializable> mergeOnlyTask = conditionalTask.getListTasks().get(1); Task<? extends Serializable> mergeAndMoveTask = conditionalTask.getListTasks().get(2); /* * OPTIMIZATION * The ConditionalTask avoids linking 2 MoveTask that are expensive on blobstorage systems. Instead of * linking, it creates one MoveTask where the source is the first MoveTask source, and target is the * second MoveTask target. */ // Verify moveOnlyTask is optimized assertNull(moveOnlyTask.getChildTasks()); verifyMoveTask(moveOnlyTask, sinkDirName, tableLocation); // Verify mergeOnlyTask is NOT optimized (a merge task writes directly to finalDirName, then a MoveTask is executed) assertEquals(1, mergeOnlyTask.getChildTasks().size()); verifyMoveTask(mergeOnlyTask.getChildTasks().get(0), finalDirName, tableLocation); // Verify mergeAndMoveTask is NOT optimized assertEquals(1, mergeAndMoveTask.getChildTasks().size()); assertEquals(1, mergeAndMoveTask.getChildTasks().get(0).getChildTasks().size()); verifyMoveTask(mergeAndMoveTask.getChildTasks().get(0), sinkDirName, finalDirName); verifyMoveTask(mergeAndMoveTask.getChildTasks().get(0).getChildTasks().get(0), finalDirName, tableLocation); } @Test public void testConditionalMoveTaskIsNotOptimized() throws SemanticException { hiveConf.set(HiveConf.ConfVars.HIVE_BLOBSTORE_OPTIMIZATIONS_ENABLED.varname, "false"); Path sinkDirName = new Path("s3a://bucket/scratch/-ext-10002"); FileSinkOperator fileSinkOperator = createFileSinkOperator(sinkDirName); Path finalDirName = new Path("s3a://bucket/scratch/-ext-10000"); Path tableLocation = new Path("s3a://bucket/warehouse/table"); Task<MoveWork> moveTask = createMoveTask(finalDirName, tableLocation); List<Task<MoveWork>> moveTaskList = Arrays.asList(moveTask); GenMapRedUtils.createMRWorkForMergingFiles(fileSinkOperator, finalDirName, null, moveTaskList, hiveConf, dummyMRTask); ConditionalTask conditionalTask = (ConditionalTask)dummyMRTask.getChildTasks().get(0); Task<? extends Serializable> moveOnlyTask = conditionalTask.getListTasks().get(0); Task<? extends Serializable> mergeOnlyTask = conditionalTask.getListTasks().get(1); Task<? extends Serializable> mergeAndMoveTask = conditionalTask.getListTasks().get(2); // Verify moveOnlyTask is NOT optimized assertEquals(1, moveOnlyTask.getChildTasks().size()); verifyMoveTask(moveOnlyTask, sinkDirName, finalDirName); verifyMoveTask(moveOnlyTask.getChildTasks().get(0), finalDirName, tableLocation); // Verify mergeOnlyTask is NOT optimized assertEquals(1, mergeOnlyTask.getChildTasks().size()); verifyMoveTask(mergeOnlyTask.getChildTasks().get(0), finalDirName, tableLocation); // Verify mergeAndMoveTask is NOT optimized assertEquals(1, mergeAndMoveTask.getChildTasks().size()); assertEquals(1, mergeAndMoveTask.getChildTasks().get(0).getChildTasks().size()); verifyMoveTask(mergeAndMoveTask.getChildTasks().get(0), sinkDirName, finalDirName); verifyMoveTask(mergeAndMoveTask.getChildTasks().get(0).getChildTasks().get(0), finalDirName, tableLocation); } @Test public void testConditionalMoveOnHdfsIsNotOptimized() throws SemanticException { hiveConf.set(HiveConf.ConfVars.HIVE_BLOBSTORE_OPTIMIZATIONS_ENABLED.varname, "true"); Path sinkDirName = new Path("hdfs://bucket/scratch/-ext-10002"); FileSinkOperator fileSinkOperator = createFileSinkOperator(sinkDirName); Path finalDirName = new Path("hdfs://bucket/scratch/-ext-10000"); Path tableLocation = new Path("hdfs://bucket/warehouse/table"); Task<MoveWork> moveTask = createMoveTask(finalDirName, tableLocation); List<Task<MoveWork>> moveTaskList = Arrays.asList(moveTask); GenMapRedUtils.createMRWorkForMergingFiles(fileSinkOperator, finalDirName, null, moveTaskList, hiveConf, dummyMRTask); ConditionalTask conditionalTask = (ConditionalTask)dummyMRTask.getChildTasks().get(0); Task<? extends Serializable> moveOnlyTask = conditionalTask.getListTasks().get(0); Task<? extends Serializable> mergeOnlyTask = conditionalTask.getListTasks().get(1); Task<? extends Serializable> mergeAndMoveTask = conditionalTask.getListTasks().get(2); // Verify moveOnlyTask is NOT optimized assertEquals(1, moveOnlyTask.getChildTasks().size()); verifyMoveTask(moveOnlyTask, sinkDirName, finalDirName); verifyMoveTask(moveOnlyTask.getChildTasks().get(0), finalDirName, tableLocation); // Verify mergeOnlyTask is NOT optimized assertEquals(1, mergeOnlyTask.getChildTasks().size()); verifyMoveTask(mergeOnlyTask.getChildTasks().get(0), finalDirName, tableLocation); // Verify mergeAndMoveTask is NOT optimized assertEquals(1, mergeAndMoveTask.getChildTasks().size()); assertEquals(1, mergeAndMoveTask.getChildTasks().get(0).getChildTasks().size()); verifyMoveTask(mergeAndMoveTask.getChildTasks().get(0), sinkDirName, finalDirName); verifyMoveTask(mergeAndMoveTask.getChildTasks().get(0).getChildTasks().get(0), finalDirName, tableLocation); } private FileSinkOperator createFileSinkOperator(Path finalDirName) { FileSinkOperator fileSinkOperator = mock(FileSinkOperator.class); TableDesc tableDesc = new TableDesc(HiveInputFormat.class, HiveOutputFormat.class, new Properties()); FileSinkDesc fileSinkDesc = new FileSinkDesc(finalDirName, tableDesc, false); fileSinkDesc.setDirName(finalDirName); when(fileSinkOperator.getConf()).thenReturn(fileSinkDesc); when(fileSinkOperator.getSchema()).thenReturn(mock(RowSchema.class)); fileSinkDesc.setTableInfo(tableDesc); when(fileSinkOperator.getCompilationOpContext()).thenReturn(mock(CompilationOpContext.class)); return fileSinkOperator; } private Task<MoveWork> createMoveTask(Path source, Path destination) { Task<MoveWork> moveTask = mock(MoveTask.class); MoveWork moveWork = new MoveWork(); moveWork.setLoadFileWork(new LoadFileDesc(source, destination, true, null, null)); when(moveTask.getWork()).thenReturn(moveWork); return moveTask; } private void verifyMoveTask(Task<? extends Serializable> task, Path source, Path target) { MoveTask moveTask = (MoveTask)task; assertEquals(source, moveTask.getWork().getLoadFileWork().getSourcePath()); assertEquals(target, moveTask.getWork().getLoadFileWork().getTargetDir()); } }