/*
 * Copyright 2015 herd contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.finra.herd.service.helper;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import org.junit.Assert;
import org.junit.Test;

import org.finra.herd.model.api.xml.BusinessObjectDataKey;
import org.finra.herd.model.api.xml.BusinessObjectFormatDdlRequest;
import org.finra.herd.model.api.xml.SchemaColumn;
import org.finra.herd.model.dto.HivePartitionDto;
import org.finra.herd.model.jpa.BusinessObjectDataEntity;
import org.finra.herd.model.jpa.BusinessObjectFormatEntity;
import org.finra.herd.model.jpa.SchemaColumnEntity;
import org.finra.herd.service.AbstractServiceTest;

/**
 * This class tests functionality within the Hive13DdlGenerator class.
 */
public class Hive13DdlGeneratorTest extends AbstractServiceTest
{
    @Test
    public void testGetHivePartitions()
    {
        // Create a test business object data entity.
        BusinessObjectDataEntity businessObjectDataEntity = businessObjectDataDaoTestHelper
            .createBusinessObjectDataEntity(NAMESPACE, BDEF_NAME, FORMAT_USAGE_CODE, FORMAT_FILE_TYPE_CODE, FORMAT_VERSION, PARTITION_VALUE, DATA_VERSION,
                true, BDATA_STATUS);

        List<SchemaColumn> autoDiscoverableSubPartitionColumns;
        List<String> storageFilePaths;
        List<HivePartitionDto> expectedHivePartitions;
        List<HivePartitionDto> resultHivePartitions;

        // Get business object data key.
        BusinessObjectDataKey businessObjectDataKey = businessObjectDataHelper.getBusinessObjectDataKey(businessObjectDataEntity);

        // No storage files.
        autoDiscoverableSubPartitionColumns = getPartitionColumns(Arrays.asList("Column1", "column2"));
        storageFilePaths = new ArrayList<>();
        expectedHivePartitions = new ArrayList<>();
        resultHivePartitions = hive13DdlGenerator
            .getHivePartitions(businessObjectDataKey, autoDiscoverableSubPartitionColumns, TEST_S3_KEY_PREFIX, storageFilePaths, businessObjectDataEntity,
                STORAGE_NAME);
        assertEquals(expectedHivePartitions, resultHivePartitions);

        // Single level partitioning.
        autoDiscoverableSubPartitionColumns = new ArrayList<>();
        storageFilePaths = getStorageFilePaths(Arrays.asList("/file1.dat", "/file2.dat"));
        expectedHivePartitions = Arrays.asList(HivePartitionDto.builder().path("").partitionValues(Arrays.asList(PARTITION_VALUE)).build());
        resultHivePartitions = hive13DdlGenerator
            .getHivePartitions(businessObjectDataKey, autoDiscoverableSubPartitionColumns, TEST_S3_KEY_PREFIX, storageFilePaths, businessObjectDataEntity,
                STORAGE_NAME);
        assertEquals(expectedHivePartitions, resultHivePartitions);
        // Test that we match column names in storage file paths ignoring the case.
        autoDiscoverableSubPartitionColumns = getPartitionColumns(Arrays.asList("Column1", "column2"));
        storageFilePaths = getStorageFilePaths(Arrays.asList("/COLUMN1=111/COLUMN2=222/file.dat", "/column1=aa/column2=bb/"));
        expectedHivePartitions = Arrays
            .asList(HivePartitionDto.builder().path("/COLUMN1=111/COLUMN2=222").partitionValues(Arrays.asList(PARTITION_VALUE, "111", "222")).build(),
                HivePartitionDto.builder().path("/column1=aa/column2=bb").partitionValues(Arrays.asList(PARTITION_VALUE, "aa", "bb")).build());
        resultHivePartitions = hive13DdlGenerator
            .getHivePartitions(businessObjectDataKey, autoDiscoverableSubPartitionColumns, TEST_S3_KEY_PREFIX, storageFilePaths, businessObjectDataEntity,
                STORAGE_NAME);
        assertEquals(expectedHivePartitions, resultHivePartitions);
    }

    @Test
    public void testGetHivePartitionsPatternMismatch()
    {
        // Create a test business object data entity.
        BusinessObjectDataEntity businessObjectDataEntity = businessObjectDataDaoTestHelper
            .createBusinessObjectDataEntity(NAMESPACE, BDEF_NAME, FORMAT_USAGE_CODE, FORMAT_FILE_TYPE_CODE, FORMAT_VERSION, PARTITION_VALUE, DATA_VERSION,
                true, BDATA_STATUS);

        List<SchemaColumn> autoDiscoverableSubPartitionColumns = getPartitionColumns(Arrays.asList("Column1", "column2"));
        String pattern = hive13DdlGenerator.getHivePathPattern(autoDiscoverableSubPartitionColumns).pattern();
        List<String> badFilePaths = Arrays.asList("/column1=a/column2=b/extra-folder/file.dat",  // extra folder
            "/column2=a/column1=b/file.dat",    // partition columns out of order
            "/column1=a/file.dat",              // missing partition sub-directory
            "/column1=a/column2=/file.dat",     // missing partition value
            "/column1=a/column2/file.dat",      // missing partition value
            "/column1=a/a/column2=2/file.dat",  // slash in a partition value
            "/column1=a/column2=2"              // missing trailing '/' character
        );

        // Get business object data key.
        BusinessObjectDataKey businessObjectDataKey = businessObjectDataHelper.getBusinessObjectDataKey(businessObjectDataEntity);

        for (int i = 0; i < badFilePaths.size(); i++)
        {
            List<String> storageFilePaths = getStorageFilePaths(badFilePaths.subList(i, i + 1));
            try
            {
                hive13DdlGenerator.getHivePartitions(businessObjectDataKey, autoDiscoverableSubPartitionColumns, TEST_S3_KEY_PREFIX, storageFilePaths,
                    businessObjectDataEntity, STORAGE_NAME);
                fail("Should throw an IllegalArgumentException when storage file does not match the expected Hive sub-directory pattern.");
            }
            catch (IllegalArgumentException e)
            {
                assertEquals(String.format("Registered storage file or directory does not match the expected Hive sub-directory pattern. " +
                        "Storage: {%s}, file/directory: {%s}, business object data: {%s}, S3 key prefix: {%s}, pattern: {^%s$}", STORAGE_NAME,
                    storageFilePaths.get(0), businessObjectDataHelper.businessObjectDataEntityAltKeyToString(businessObjectDataEntity), TEST_S3_KEY_PREFIX,
                    pattern), e.getMessage());
            }
        }
    }
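    /**
     * Illustrative sketch, not part of the original suite: probes the regular expression returned by getHivePathPattern() directly, to make the matching
     * rules exercised above explicit. The use of Matcher.matches() assumes the pattern is applied to the full path relative to the S3 key prefix, which
     * the "{^%s$}" framing in the expected error message above suggests; the sample paths are hypothetical.
     */
    @Test
    public void testGetHivePathPatternSketch()
    {
        java.util.regex.Pattern pattern = hive13DdlGenerator.getHivePathPattern(getPartitionColumns(Arrays.asList("Column1", "column2")));

        // A well-formed path lists every sub-partition column in schema order, each with a non-empty value, and terminates the last partition
        // sub-directory with a '/' before the optional file name. Column name matching ignores the case.
        Assert.assertTrue(pattern.matcher("/column1=a/column2=b/file.dat").matches());
        Assert.assertTrue(pattern.matcher("/COLUMN1=111/COLUMN2=222/").matches());

        // Out-of-order partition columns must not match (the second bad path from testGetHivePartitionsPatternMismatch above).
        Assert.assertFalse(pattern.matcher("/column2=a/column1=b/file.dat").matches());
    }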
    @Test
    public void testGetHivePartitionsMultiplePathsFound()
    {
        // Create a test business object data entity.
        BusinessObjectDataEntity businessObjectDataEntity = businessObjectDataDaoTestHelper
            .createBusinessObjectDataEntity(NAMESPACE, BDEF_NAME, FORMAT_USAGE_CODE, FORMAT_FILE_TYPE_CODE, FORMAT_VERSION, PARTITION_VALUE, DATA_VERSION,
                true, BDATA_STATUS);

        List<SchemaColumn> autoDiscoverableSubPartitionColumns = getPartitionColumns(Arrays.asList("Column1", "column2"));
        List<String> partitionPaths = Arrays.asList("/COLUMN1=111/COLUMN2=222", "/column1=111/COLUMN2=222");
        List<String> storageFilePaths = getStorageFilePaths(Arrays.asList(partitionPaths.get(0) + "/file.dat", partitionPaths.get(1) + "/file.dat"));

        try
        {
            hive13DdlGenerator
                .getHivePartitions(businessObjectDataHelper.getBusinessObjectDataKey(businessObjectDataEntity), autoDiscoverableSubPartitionColumns,
                    TEST_S3_KEY_PREFIX, storageFilePaths, businessObjectDataEntity, STORAGE_NAME);
            fail("Should throw an IllegalArgumentException when multiple locations exist for the same Hive partition.");
        }
        catch (IllegalArgumentException e)
        {
            assertEquals(String.format("Found two different locations for the same Hive partition. " +
                    "Storage: {%s}, business object data: {%s}, S3 key prefix: {%s}, path[1]: {%s}, path[2]: {%s}", STORAGE_NAME,
                businessObjectDataHelper.businessObjectDataEntityAltKeyToString(businessObjectDataEntity), TEST_S3_KEY_PREFIX, partitionPaths.get(0),
                partitionPaths.get(1)), e.getMessage());
        }
    }

    @Test
    public void testGetDdlCharacterValueEmptyString()
    {
        assertEquals("", hive13DdlGenerator.getDdlCharacterValue(""));
    }

    @Test
    public void testGetDdlCharacterValueLineFeed()
    {
        // Linefeed character is an ASCII octal \012 (decimal 10) which gets escaped as part of the DDL generation.
        assertEquals("\\012", hive13DdlGenerator.getDdlCharacterValue("\n"));
    }

    @Test
    public void testGetDdlCharacterValueAsciiPrintable()
    {
        assertEquals("|", hive13DdlGenerator.getDdlCharacterValue("|"));
    }

    @Test
    public void testGetDdlCharacterValueAsciiNonAsciiMix()
    {
        // It makes no sense to output a single non-ASCII printable character in the middle of other ASCII printable characters,
        // but that's what we do if a user actually specifies this. We might want to add more validation to prevent this scenario in the future,
        // since Hive shouldn't allow this. Please note that decimal 128 is 200 in octal.
        assertEquals("A\\200B", hive13DdlGenerator.getDdlCharacterValue("A" + String.valueOf((char) 128) + "B"));
    }

    @Test
    public void testGetDdlCharacterValueTwoNonAsciiPrintableChars()
    {
        // Decimal 128 is 200 in octal.
        assertEquals("\\200\\001", hive13DdlGenerator.getDdlCharacterValue(String.valueOf((char) 128) + String.valueOf((char) 1)));
    }

    @Test
    public void testGetDdlCharacterValueEscapeSingleBackslash()
    {
        assertEquals("\\", hive13DdlGenerator.getDdlCharacterValue("\\"));
        assertEquals("\\", hive13DdlGenerator.getDdlCharacterValue("\\", false));
        assertEquals("\\\\", hive13DdlGenerator.getDdlCharacterValue("\\", true));
        assertEquals("\\\\", hive13DdlGenerator.getDdlCharacterValue("\\\\", true));
    }
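    /**
     * Illustrative sketch, not part of the original suite: the octal escapes asserted above (e.g. linefeed, decimal 10, escaping to "\012") can be derived
     * with String.format("\\%03o", codePoint). The expectation that getDdlCharacterValue() escapes the carriage return character the same way is an
     * assumption extrapolated from the linefeed test, not separately documented behavior.
     */
    @Test
    public void testGetDdlCharacterValueCarriageReturnSketch()
    {
        // Carriage return is decimal 13, which formats as the octal escape "\015".
        assertEquals(String.format("\\%03o", (int) '\r'), hive13DdlGenerator.getDdlCharacterValue("\r"));
    }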
    @Test
    public void testEscapeSingleQuotes()
    {
        // Create a test vector with key=input and value=output values.
        LinkedHashMap<String, String> testVector = new LinkedHashMap<>();
        testVector.put("some text without single quotes", "some text without single quotes");
        testVector.put("'some \\'text\\' with single 'quotes'", "\\'some \\'text\\' with single \\'quotes\\'");
        testVector.put("'", "\\'");
        testVector.put("''''", "\\'\\'\\'\\'");
        testVector.put("'\'\\'", "\\'\\'\\'");

        // Loop over all entries in the test vector.
        for (Map.Entry<String, String> entry : testVector.entrySet())
        {
            assertEquals(entry.getValue(), hive13DdlGenerator.escapeSingleQuotes(entry.getKey()));
        }
    }
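    /**
     * Illustrative sketch, not part of the original suite: the last test vector entry above maps an already escaped quote ("\'") to itself, which implies
     * escapeSingleQuotes() leaves previously escaped quotes alone and should therefore be idempotent. That property is an assumption extrapolated from the
     * vector, not documented behavior.
     */
    @Test
    public void testEscapeSingleQuotesIdempotentSketch()
    {
        String escapedOnce = hive13DdlGenerator.escapeSingleQuotes("'some 'text' with single quotes'");

        // Escaping an already escaped string is expected to be a no-op.
        assertEquals(escapedOnce, hive13DdlGenerator.escapeSingleQuotes(escapedOnce));
    }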
    /**
     * Asserts that generateReplaceColumnsStatement() generates the correct DDL statement.
     */
    @Test
    public void testGenerateReplaceColumnsStatement()
    {
        BusinessObjectFormatDdlRequest businessObjectFormatDdlRequest = new BusinessObjectFormatDdlRequest();
        businessObjectFormatDdlRequest.setTableName(TABLE_NAME);

        BusinessObjectFormatEntity businessObjectFormatEntity = businessObjectFormatDaoTestHelper
            .createBusinessObjectFormatEntity(NAMESPACE, BDEF_NAME, FORMAT_USAGE_CODE, FORMAT_FILE_TYPE_CODE, FORMAT_VERSION, FORMAT_DESCRIPTION, true,
                PARTITION_KEY);
        {
            SchemaColumnEntity schemaColumnEntity = new SchemaColumnEntity();
            schemaColumnEntity.setPosition(0);
            schemaColumnEntity.setName("col1");
            schemaColumnEntity.setType("varchar");
            schemaColumnEntity.setSize("255");
            schemaColumnEntity.setDescription("lorem ipsum");
            businessObjectFormatEntity.getSchemaColumns().add(schemaColumnEntity);
        }
        {
            SchemaColumnEntity schemaColumnEntity = new SchemaColumnEntity();
            schemaColumnEntity.setPosition(1);
            schemaColumnEntity.setName("col2");
            schemaColumnEntity.setType("date");
            businessObjectFormatEntity.getSchemaColumns().add(schemaColumnEntity);
        }

        String actual = hive13DdlGenerator.generateReplaceColumnsStatement(businessObjectFormatDdlRequest, businessObjectFormatEntity);

        String expected = "ALTER TABLE `" + businessObjectFormatDdlRequest.getTableName() + "` REPLACE COLUMNS (\n" +
            "    `col1` VARCHAR(255) COMMENT 'lorem ipsum',\n" +
            "    `col2` DATE);";
        Assert.assertEquals("generated DDL", expected, actual);
    }

    /**
     * Asserts that generateReplaceColumnsStatement() throws an IllegalArgumentException when the format entity specifies only partition columns, but no
     * regular schema columns.
     */
    @Test
    public void testGenerateReplaceColumnsStatementAssertionErrorIfColumnsEmpty()
    {
        BusinessObjectFormatDdlRequest businessObjectFormatDdlRequest = new BusinessObjectFormatDdlRequest();
        businessObjectFormatDdlRequest.setTableName(TABLE_NAME);

        BusinessObjectFormatEntity businessObjectFormatEntity = businessObjectFormatDaoTestHelper
            .createBusinessObjectFormatEntity(NAMESPACE, BDEF_NAME, FORMAT_USAGE_CODE, FORMAT_FILE_TYPE_CODE, FORMAT_VERSION, FORMAT_DESCRIPTION, true,
                PARTITION_KEY);
        {
            SchemaColumnEntity schemaColumnEntity = new SchemaColumnEntity();
            schemaColumnEntity.setPartitionLevel(0);
            schemaColumnEntity.setName("col1");
            schemaColumnEntity.setType("date");
            businessObjectFormatEntity.getSchemaColumns().add(schemaColumnEntity);
        }

        try
        {
            hive13DdlGenerator.generateReplaceColumnsStatement(businessObjectFormatDdlRequest, businessObjectFormatEntity);
            Assert.fail("expected IllegalArgumentException, but no exception was thrown");
        }
        catch (Exception e)
        {
            Assert.assertEquals("thrown exception type", IllegalArgumentException.class, e.getClass());
            Assert.assertEquals("thrown exception message",
                "No schema columns specified for business object format {namespace: \"" + NAMESPACE + "\", businessObjectDefinitionName: \"" + BDEF_NAME +
                    "\", businessObjectFormatUsage: \"" + FORMAT_USAGE_CODE + "\", businessObjectFormatFileType: \"" + FORMAT_FILE_TYPE_CODE +
                    "\", businessObjectFormatVersion: " + FORMAT_VERSION + "}.", e.getMessage());
        }
    }

    private List<SchemaColumn> getPartitionColumns(List<String> partitionColumnNames)
    {
        List<SchemaColumn> schemaColumns = new ArrayList<>();

        for (String partitionColumnName : partitionColumnNames)
        {
            SchemaColumn schemaColumn = new SchemaColumn();
            schemaColumns.add(schemaColumn);
            schemaColumn.setName(partitionColumnName);
        }

        return schemaColumns;
    }

    private List<String> getStorageFilePaths(List<String> storageFileRelativePaths)
    {
        List<String> storageFilePaths = new ArrayList<>();

        for (String storageFileRelativePath : storageFileRelativePaths)
        {
            storageFilePaths.add(String.format("%s%s", TEST_S3_KEY_PREFIX, storageFileRelativePath));
        }

        return storageFilePaths;
    }
}