/*******************************************************************************
* Copyright 2017 Capital One Services, LLC and Bitwise, Inc.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License
*******************************************************************************/
package hydrograph.engine.cascading.assembly;
import cascading.flow.Flow;
import cascading.flow.FlowDef;
import cascading.flow.hadoop2.Hadoop2MR1FlowConnector;
import hydrograph.engine.cascading.assembly.infra.ComponentParameters;
import hydrograph.engine.core.component.entity.InputFileHiveTextEntity;
import hydrograph.engine.core.component.entity.elements.OutSocket;
import hydrograph.engine.core.component.entity.elements.SchemaField;
import hydrograph.engine.utilites.AssemblyBuildHelper;
import hydrograph.engine.utilites.CascadingTestCase;
import org.junit.Test;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
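/**
 * Tests for {@link InputFileHiveTextAssembly}. Each test configures an
 * {@link InputFileHiveTextEntity} against a pre-existing Hive text table
 * (assumed to be created beforehand, e.g. by the corresponding output-assembly
 * tests), wires the assembly into a Cascading {@link FlowDef}, runs the flow
 * on the Hadoop2 MR1 platform, and validates the records written to the sink.
 */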
public class InputFileHiveTextFileTest {
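// Reads a simple (non-partitioned) Hive text table and validates both the
// field count and the record count of the output.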
@Test
public void itShouldTestHiveTextInputFileAssembly() throws IOException {
String databaseName = "devl_fraud";
String tableName = "tsimple";
String outPath = "../elt-command-line/testData/Output/outputFromHiveParquet";
InputFileHiveTextEntity entity = new InputFileHiveTextEntity();
entity.setDatabaseName(databaseName);
entity.setTableName(tableName);
entity.setComponentId("HiveTextInput");
List<SchemaField> fieldList = new ArrayList<SchemaField>();
SchemaField sf1 = new SchemaField("id", "java.lang.Integer");
SchemaField sf2 = new SchemaField("fname", "java.lang.String");
SchemaField sf3 = new SchemaField("date", "java.util.Date");
SchemaField sf4 = new SchemaField("salary", "java.math.BigDecimal");
sf3.setFieldFormat("yyyy-MM-dd");
sf4.setFieldScale(2);
sf4.setFieldPrecision(6);
fieldList.add(sf1);
fieldList.add(sf2);
fieldList.add(sf3);
fieldList.add(sf4);
entity.setFieldsList(fieldList);
entity.setDelimiter(",");
/*entity.setPartitionKeys(new String[]{"salary"});*/
Properties runtimeProp = new Properties();
runtimeProp.setProperty("prop", "propValue");
entity.setRuntimeProperties(runtimeProp);
List<OutSocket> outSockets = new ArrayList<OutSocket>();
outSockets.add(new OutSocket("outSocket"));
entity.setOutSocketList(outSockets);
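// Wire the assembly into a Cascading flow: the component's out link is routed
// to a plain file sink at outPath so the results can be inspected.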
FlowDef flowDef = FlowDef.flowDef();
ComponentParameters cpInput = new ComponentParameters();
cpInput.setFlowDef(flowDef);
InputFileHiveTextAssembly inputFileHiveTextAssembly = new InputFileHiveTextAssembly(entity, cpInput);
AssemblyBuildHelper.generateOutputPipes(
inputFileHiveTextAssembly.getOutLink("out", "outSocket", entity.getComponentId()), outPath, flowDef);
Flow<?> flow = new Hadoop2MR1FlowConnector().connect(flowDef);
flow.complete();
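// Expect 4 fields per record and 6 records in total; the record count assumes
// the contents of the devl_fraud.tsimple test table.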
CascadingTestCase.validateFieldLength(flow.openSink(), 4);
CascadingTestCase.validateFileLength(flow.openSink(), 6);
}
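// Reads a Hive text table partitioned on the salary column and validates the
// field count of the output.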
@Test
public void itShouldTestHiveTextInputFileAssemblyWithPartition() throws IOException {
String databaseName = "devl_fraud";
String tableName = "tPartition";
String outPath = "../elt-command-line/testData/Output/outputFromHiveParquetPartition";
InputFileHiveTextEntity entity = new InputFileHiveTextEntity();
entity.setDatabaseName(databaseName);
entity.setTableName(tableName);
entity.setComponentId("HiveTextInput");
List<SchemaField> fieldList = new ArrayList<SchemaField>();
// The partition column's values are not stored in the data files, so it would
// normally be omitted from the schema. Due to a bug in Cascading, the field is
// kept in the schema as a workaround.
SchemaField sf1 = new SchemaField("id", "java.lang.Integer");
SchemaField sf2 = new SchemaField("fname", "java.lang.String");
SchemaField sf3 = new SchemaField("lname", "java.util.Date");
SchemaField sf4 = new SchemaField("salary", "java.math.BigDecimal");
sf3.setFieldFormat("yyyy-MM-dd");
sf4.setFieldScale(2);
sf4.setFieldPrecision(6);
fieldList.add(sf1);
fieldList.add(sf2);
fieldList.add(sf3);
fieldList.add(sf4);
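// salary is the partition key; in the standard Hive layout its values come
// from the partition directory names rather than from the data files.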
String[] partitionField = new String[] { "salary" };
entity.setFieldsList(fieldList);
entity.setPartitionKeys(partitionField);
entity.setDelimiter(",");
Properties runtimeProp = new Properties();
runtimeProp.setProperty("prop", "propValue");
entity.setRuntimeProperties(runtimeProp);
List<OutSocket> outSockets = new ArrayList<OutSocket>();
outSockets.add(new OutSocket("outSocket"));
entity.setOutSocketList(outSockets);
FlowDef flowDef = FlowDef.flowDef();
ComponentParameters cpInput = new ComponentParameters();
cpInput.setFlowDef(flowDef);
InputFileHiveTextAssembly inputFileHiveTextAssembly = new InputFileHiveTextAssembly(entity, cpInput);
AssemblyBuildHelper.generateOutputPipes(
inputFileHiveTextAssembly.getOutLink("out", "outSocket", entity.getComponentId()), outPath, flowDef);
Flow<?> flow = new Hadoop2MR1FlowConnector().connect(flowDef);
flow.complete();
CascadingTestCase.validateFieldLength(flow.openSink(), 4);
}
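// Reads a Hive external text table whose data lives at externalTablePathUri
// and validates the field count of the output.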
@Test
public void itShouldTestHiveTextInputFileAssemblyWithExternalTable() throws IOException {
String databaseName = "devl_fraud";
String tableName = "t9";
String externalTablePathUri = "../elt-command-line/testData/Output/HiveTextOutputExternalTable";
String outPath = "../elt-command-line/testData/Output/outputFromHiveParquetExternalTable";
InputFileHiveTextEntity entity = new InputFileHiveTextEntity();
entity.setDatabaseName(databaseName);
entity.setTableName(tableName);
entity.setComponentId("HiveTextInput");
entity.setDelimiter(",");
List<SchemaField> fieldList = new ArrayList<SchemaField>();
SchemaField sf1 = new SchemaField("id", "java.lang.Integer");
SchemaField sf2 = new SchemaField("fname", "java.lang.String");
SchemaField sf3 = new SchemaField("lname", "java.util.Date");
SchemaField sf4 = new SchemaField("salary", "java.math.BigDecimal");
sf3.setFieldFormat("yyyy-MM-dd");
sf4.setFieldScale(2);
sf4.setFieldPrecision(6);
fieldList.add(sf1);
fieldList.add(sf2);
fieldList.add(sf3);
fieldList.add(sf4);
/*String[] partitionField = new String[1];
partitionField[0] = "salary";*/
//entity.setPartitionKeys(partitionField);
entity.setFieldsList(fieldList);
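// Point the entity at the external table's storage location; the data there
// is assumed to have been produced by the matching output-assembly test.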
entity.setExternalTablePathUri(externalTablePathUri);
Properties runtimeProp = new Properties();
runtimeProp.setProperty("prop", "propValue");
entity.setRuntimeProperties(runtimeProp);
List<OutSocket> outSockets = new ArrayList<OutSocket>();
outSockets.add(new OutSocket("outSocket"));
entity.setOutSocketList(outSockets);
FlowDef flowDef = FlowDef.flowDef();
ComponentParameters cpInput = new ComponentParameters();
cpInput.setFlowDef(flowDef);
InputFileHiveTextAssembly inputFileHiveTextAssembly = new InputFileHiveTextAssembly(entity, cpInput);
AssemblyBuildHelper.generateOutputPipes(
inputFileHiveTextAssembly.getOutLink("out", "outSocket", entity.getComponentId()), outPath, flowDef);
Flow<?> flow = new Hadoop2MR1FlowConnector().connect(flowDef);
flow.complete();
CascadingTestCase.validateFieldLength(flow.openSink(), 4);
}
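// Reads a Hive external text table that is also partitioned on salary and
// validates the field count of the output.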
@Test
public void itShouldTestHiveTextInputFileAssemblyWithExternalSchemaAndPartition() throws IOException {
String databaseName = "devl_fraud";
String tableName = "tPartitionExternal";
String externalTablePathUri = "../elt-command-line/testData/Output/HiveParquetOutputPartitionExternalTable";
String outPath = "../elt-command-line/testData/Output/romHiveParquetOutputPartitionExternalTable";
InputFileHiveTextEntity entity = new InputFileHiveTextEntity();
entity.setDatabaseName(databaseName);
entity.setTableName(tableName);
entity.setComponentId("HiveTextInput");
entity.setDelimiter(",");
List<SchemaField> fieldList = new ArrayList<SchemaField>();
SchemaField sf1 = new SchemaField("id", "java.lang.Integer");
SchemaField sf2 = new SchemaField("fname", "java.lang.String");
SchemaField sf3 = new SchemaField("lname", "java.util.Date");
SchemaField sf4 = new SchemaField("salary", "java.math.BigDecimal");
sf3.setFieldFormat("yyyy-MM-dd");
sf4.setFieldScale(2);
sf4.setFieldPrecision(6);
fieldList.add(sf1);
fieldList.add(sf2);
fieldList.add(sf3);
fieldList.add(sf4);
String[] partitionField = new String[] { "salary" };
entity.setFieldsList(fieldList);
entity.setPartitionKeys(partitionField);
entity.setExternalTablePathUri(externalTablePathUri);
Properties runtimeProp = new Properties();
runtimeProp.setProperty("prop", "propValue");
entity.setRuntimeProperties(runtimeProp);
List<OutSocket> outSockets = new ArrayList<OutSocket>();
outSockets.add(new OutSocket("outSocket"));
entity.setOutSocketList(outSockets);
FlowDef flowDef = FlowDef.flowDef();
ComponentParameters cpInput = new ComponentParameters();
cpInput.setFlowDef(flowDef);
InputFileHiveTextAssembly inputFileHiveTextAssembly = new InputFileHiveTextAssembly(entity, cpInput);
AssemblyBuildHelper.generateOutputPipes(
inputFileHiveTextAssembly.getOutLink("out", "outSocket", entity.getComponentId()), outPath, flowDef);
Flow<?> flow = new Hadoop2MR1FlowConnector().connect(flowDef);
flow.complete();
CascadingTestCase.validateFieldLength(flow.openSink(), 4);
}
}