/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.falcon.regression.hcat;
import org.apache.commons.lang.StringUtils;
import org.apache.falcon.regression.Entities.FeedMerlin;
import org.apache.falcon.regression.core.bundle.Bundle;
import org.apache.falcon.entity.v0.cluster.Interfacetype;
import org.apache.falcon.entity.v0.EntityType;
import org.apache.falcon.entity.v0.Frequency;
import org.apache.falcon.entity.v0.process.EngineType;
import org.apache.falcon.regression.core.helpers.ColoHelper;
import org.apache.falcon.regression.core.util.AssertUtil;
import org.apache.falcon.regression.core.util.BundleUtil;
import org.apache.falcon.regression.core.util.HCatUtil;
import org.apache.falcon.regression.core.util.HadoopUtil;
import org.apache.falcon.regression.core.util.InstanceUtil;
import org.apache.falcon.regression.core.util.OSUtil;
import org.apache.falcon.regression.core.util.TimeUtil;
import org.apache.falcon.regression.core.util.Util;
import org.apache.falcon.regression.testHelper.BaseTestClass;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hive.hcatalog.data.schema.HCatFieldSchema;
import org.apache.hive.hcatalog.api.HCatAddPartitionDesc;
import org.apache.hive.hcatalog.api.HCatClient;
import org.apache.hive.hcatalog.api.HCatCreateTableDesc;
import org.apache.hive.hcatalog.common.HCatException;
import org.apache.log4j.Logger;
import org.apache.oozie.client.CoordinatorAction;
import org.apache.oozie.client.OozieClient;
import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.testng.Assert;
import org.testng.annotations.AfterMethod;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Tests falcon processes that run hive scripts.
*/
@Test(groups = "embedded")
public class HCatProcessTest extends BaseTestClass {
// Logger shared by every test in this class.
private static final Logger LOGGER = Logger.getLogger(HCatProcessTest.class);
// Helper, file system and Oozie client of the first configured server.
private final ColoHelper cluster = servers.get(0);
private final FileSystem clusterFS = serverFS.get(0);
private final OozieClient clusterOC = serverOC.get(0);
// HCatalog client; obtained from the cluster helper in setUp().
private HCatClient clusterHC;
// Base HDFS directory of this test class, as returned by cleanAndGetTestDir().
private final String baseTestHDFSDir = cleanAndGetTestDir();
// HDFS directory the local hive scripts are uploaded to, and the default script.
private final String hiveScriptDir = baseTestHDFSDir + "/hive";
private final String hiveScriptFile = hiveScriptDir + "/script.hql";
// HDFS directory that receives OSUtil.RESOURCES_OOZIE in setUp().
private final String aggregateWorkflowDir = baseTestHDFSDir + "/aggregator";
// HDFS paths of the hive scripts used for the other input/output combinations.
private final String hiveScriptFileNonHCatInput = hiveScriptDir + "/script_non_hcat_input.hql";
private final String hiveScriptFileNonHCatOutput =
hiveScriptDir + "/script_non_hcat_output.hql";
private final String hiveScriptTwoHCatInputOneHCatOutput =
hiveScriptDir + "/script_two_hcat_input_one_hcat_output.hql";
private final String hiveScriptOneHCatInputTwoHCatOutput =
hiveScriptDir + "/script_one_hcat_input_two_hcat_output.hql";
private final String hiveScriptTwoHCatInputTwoHCatOutput =
hiveScriptDir + "/script_two_hcat_input_two_hcat_output.hql";
// HDFS locations backing the input and output tables / feeds.
private final String inputHDFSDir = baseTestHDFSDir + "/input";
private final String inputHDFSDir2 = baseTestHDFSDir + "/input2";
private final String outputHDFSDir = baseTestHDFSDir + "/output";
private final String outputHDFSDir2 = baseTestHDFSDir + "/output2";
// Database and table names; setUp() drops all four tables before each test.
private final String dbName = "default";
private final String inputTableName = "hcatprocesstest_input_table";
private final String inputTableName2 = "hcatprocesstest_input_table2";
private final String outputTableName = "hcatprocesstest_output_table";
private final String outputTableName2 = "hcatprocesstest_output_table2";
// Names of the two data columns and of the partition column of every test table.
private final String col1Name = "id";
private final String col2Name = "value";
private final String partitionColumn = "dt";
// Local resource paths: test data to upload and the directory holding the hive scripts.
private final String hcatDir = OSUtil.concat("src", "test", "resources", "hcat");
private final String localHCatData = OSUtil.concat(hcatDir, "data");
private final String hiveScript = OSUtil.concat(hcatDir, "hivescript");
// Validity window applied to the feeds and the process in every test.
private final String startDate = "2010-01-01T20:00Z";
private final String endDate = "2010-01-01T21:10Z";
/**
 * Prepares the environment for one test method: obtains the HCatalog client, builds a unique
 * HCat bundle bound to the cluster, uploads the hive scripts and the workflow resources,
 * recreates the output directories and drops the four test tables.
 *
 * @throws Exception if any bundle, HDFS or HCatalog call fails
 */
@BeforeMethod(alwaysRun = true)
public void setUp() throws Exception {
    clusterHC = cluster.getClusterHelper().getHCatClient();

    // Bundle preparation.
    bundles[0] = BundleUtil.readHCatBundle();
    bundles[0] = new Bundle(bundles[0], cluster);
    final Bundle bundle = bundles[0];
    bundle.generateUniqueBundle(this);
    bundle.setProcessWorkflow(hiveScriptFile, EngineType.HIVE);
    bundle.setClusterInterface(
        Interfacetype.REGISTRY, cluster.getClusterHelper().getHCatEndpoint());

    // HDFS preparation: start from an empty base directory, then upload the resources.
    HadoopUtil.deleteDirIfExists(baseTestHDFSDir, clusterFS);
    HadoopUtil.uploadDir(clusterFS, hiveScriptDir, hiveScript);
    HadoopUtil.uploadDir(clusterFS, aggregateWorkflowDir, OSUtil.RESOURCES_OOZIE);
    for (String outputDir : new String[]{outputHDFSDir, outputHDFSDir2}) {
        HadoopUtil.recreateDir(clusterFS, outputDir);
    }

    // HCatalog preparation: drop the tables; the third argument is passed as true for each.
    final String[] tablesToDrop = {
        inputTableName, inputTableName2, outputTableName, outputTableName2,
    };
    for (String table : tablesToDrop) {
        clusterHC.dropTable(dbName, table, true);
    }
}
/**
 * Supplies the separators that the partition-value patterns of the tests are built with.
 *
 * @return one single-element row per separator; currently only {@code "-"}
 */
@DataProvider
public String[][] generateSeparators() {
    // "/" stays disabled till FALCON-372 is fixed:
    //return new String[][] {{"-"}, {"/"}};
    final String[][] separators = {{"-"}};
    return separators;
}
/**
 * Schedules a hive process that reads one HCat table and writes one HCat table, waits for the
 * first instance to succeed and compares the content size of the first input location with
 * the matching output partition directory.
 *
 * @param separator text placed between the year, month, day and hour parts of a partition value
 * @throws Exception if data upload, table setup, scheduling or verification fails
 */
@Test(dataProvider = "generateSeparators")
public void oneHCatInputOneHCatOutput(String separator) throws Exception {
    final Bundle bundle = bundles[0];

    // Upload the local data under the input directory for the generated dates.
    final String partitionValuePattern =
        StringUtils.join(new String[]{"yyyy", "MM", "dd", "HH"}, separator);
    final List<String> partitionValues =
        getDatesList(startDate, endDate, partitionValuePattern, 60);
    final List<String> partitionLocations = HadoopUtil.flattenAndPutDataInFolder(
        clusterFS, localHCatData, inputHDFSDir, partitionValues);

    // Both tables share the same data columns and the same partition column.
    final ArrayList<HCatFieldSchema> dataColumns = new ArrayList<>();
    dataColumns.add(HCatUtil.getStringSchema(col1Name, col1Name + " comment"));
    dataColumns.add(HCatUtil.getStringSchema(col2Name, col2Name + " comment"));
    final ArrayList<HCatFieldSchema> partitionColumns = new ArrayList<>();
    partitionColumns.add(
        HCatUtil.getStringSchema(partitionColumn, partitionColumn + " partition"));

    final HCatCreateTableDesc inputTableDesc = HCatCreateTableDesc
        .create(dbName, inputTableName, dataColumns)
        .partCols(partitionColumns)
        .ifNotExists(true)
        .isTableExternal(true)
        .location(inputHDFSDir)
        .build();
    clusterHC.createTable(inputTableDesc);
    final HCatCreateTableDesc outputTableDesc = HCatCreateTableDesc
        .create(dbName, outputTableName, dataColumns)
        .partCols(partitionColumns)
        .ifNotExists(true)
        .isTableExternal(true)
        .location(outputHDFSDir)
        .build();
    clusterHC.createTable(outputTableDesc);

    addPartitionsToTable(
        partitionValues, partitionLocations, partitionColumn, dbName, inputTableName);

    // Point the input and output feeds at their tables.
    final String partitionSpec = StringUtils.join(
        new String[]{"#dt=${YEAR}", "${MONTH}", "${DAY}", "${HOUR}"}, separator);
    final String catalogPrefix = "catalog:" + dbName + ":";

    bundle.setInputFeedTableUri(catalogPrefix + inputTableName + partitionSpec);
    bundle.setInputFeedPeriodicity(1, Frequency.TimeUnit.hours);
    bundle.setInputFeedValidity(startDate, endDate);

    bundle.setOutputFeedTableUri(catalogPrefix + outputTableName + partitionSpec);
    bundle.setOutputFeedPeriodicity(1, Frequency.TimeUnit.hours);
    bundle.setOutputFeedValidity(startDate, endDate);

    // Configure and schedule the process, then wait for one successful instance.
    bundle.setProcessValidity(startDate, endDate);
    bundle.setProcessPeriodicity(1, Frequency.TimeUnit.hours);
    bundle.setProcessInputStartEnd("now(0,0)", "now(0,0)");
    bundle.submitFeedsScheduleProcess();

    InstanceUtil.waitTillInstanceReachState(
        clusterOC, bundle.getProcessName(), 1, CoordinatorAction.Status.SUCCEEDED,
        EntityType.PROCESS);

    final String firstPartition = partitionValues.get(0);
    AssertUtil.checkContentSize(
        inputHDFSDir + "/" + firstPartition,
        outputHDFSDir + "/dt=" + firstPartition,
        clusterFS);
}
/**
 * Schedules a hive process that reads two HCat tables and writes one HCat table, waits for the
 * first instance to succeed and asserts that the length of the first output partition equals
 * the summed length of the first locations of both inputs.
 *
 * @param separator text placed between the year, month, day and hour parts of a partition value
 * @throws Exception if data upload, table setup, scheduling or verification fails
 */
@Test(dataProvider = "generateSeparators")
public void twoHCatInputOneHCatOutput(String separator) throws Exception {
    final Bundle bundle = bundles[0];

    // Upload the same local data under both input directories.
    final String partitionValuePattern =
        StringUtils.join(new String[]{"yyyy", "MM", "dd", "HH"}, separator);
    final List<String> partitionValues =
        getDatesList(startDate, endDate, partitionValuePattern, 60);
    final List<String> firstInputLocations = HadoopUtil.flattenAndPutDataInFolder(
        clusterFS, localHCatData, inputHDFSDir, partitionValues);
    final List<String> secondInputLocations = HadoopUtil.flattenAndPutDataInFolder(
        clusterFS, localHCatData, inputHDFSDir2, partitionValues);

    // All tables share the same data columns and the same partition column.
    final ArrayList<HCatFieldSchema> dataColumns = new ArrayList<>();
    dataColumns.add(HCatUtil.getStringSchema(col1Name, col1Name + " comment"));
    dataColumns.add(HCatUtil.getStringSchema(col2Name, col2Name + " comment"));
    final ArrayList<HCatFieldSchema> partitionColumns = new ArrayList<>();
    partitionColumns.add(
        HCatUtil.getStringSchema(partitionColumn, partitionColumn + " partition"));

    final HCatCreateTableDesc firstInputDesc = HCatCreateTableDesc
        .create(dbName, inputTableName, dataColumns)
        .partCols(partitionColumns)
        .ifNotExists(true)
        .isTableExternal(true)
        .location(inputHDFSDir)
        .build();
    clusterHC.createTable(firstInputDesc);
    final HCatCreateTableDesc secondInputDesc = HCatCreateTableDesc
        .create(dbName, inputTableName2, dataColumns)
        .partCols(partitionColumns)
        .ifNotExists(true)
        .isTableExternal(true)
        .location(inputHDFSDir2)
        .build();
    clusterHC.createTable(secondInputDesc);
    final HCatCreateTableDesc outputDesc = HCatCreateTableDesc
        .create(dbName, outputTableName, dataColumns)
        .partCols(partitionColumns)
        .ifNotExists(true)
        .isTableExternal(true)
        .location(outputHDFSDir)
        .build();
    clusterHC.createTable(outputDesc);

    addPartitionsToTable(
        partitionValues, firstInputLocations, partitionColumn, dbName, inputTableName);
    addPartitionsToTable(
        partitionValues, secondInputLocations, partitionColumn, dbName, inputTableName2);

    final String partitionSpec = StringUtils.join(
        new String[]{"#dt=${YEAR}", "${MONTH}", "${DAY}", "${HOUR}"}, separator);
    final String catalogPrefix = "catalog:" + dbName + ":";

    // First input feed: the one that ships with the bundle.
    bundle.setInputFeedTableUri(catalogPrefix + inputTableName + partitionSpec);
    bundle.setInputFeedPeriodicity(1, Frequency.TimeUnit.hours);
    bundle.setInputFeedValidity(startDate, endDate);

    // Second input feed: a renamed copy of the first one that points at the second table.
    final String firstInputFeed = bundle.getInputFeedFromBundle();
    final FeedMerlin secondInputFeed = new FeedMerlin(firstInputFeed);
    secondInputFeed.setName(Util.readEntityName(firstInputFeed) + "-second");
    secondInputFeed.getTable().setUri(catalogPrefix + inputTableName2 + partitionSpec);
    bundle.addInputFeedToBundle("inputData2", secondInputFeed);

    bundle.setOutputFeedTableUri(catalogPrefix + outputTableName + partitionSpec);
    bundle.setOutputFeedPeriodicity(1, Frequency.TimeUnit.hours);
    bundle.setOutputFeedValidity(startDate, endDate);

    // Configure and schedule the process, then wait for one successful instance.
    bundle.setProcessValidity(startDate, endDate);
    bundle.setProcessPeriodicity(1, Frequency.TimeUnit.hours);
    bundle.setProcessInputStartEnd("now(0,0)", "now(0,0)");
    bundle.setProcessWorkflow(hiveScriptTwoHCatInputOneHCatOutput, EngineType.HIVE);
    bundle.submitFeedsScheduleProcess();

    InstanceUtil.waitTillInstanceReachState(
        clusterOC, bundle.getProcessName(), 1, CoordinatorAction.Status.SUCCEEDED,
        EntityType.PROCESS);

    final String firstPartition = partitionValues.get(0);
    final ContentSummary firstInputSummary =
        clusterFS.getContentSummary(new Path(inputHDFSDir + "/" + firstPartition));
    final ContentSummary secondInputSummary =
        clusterFS.getContentSummary(new Path(inputHDFSDir2 + "/" + firstPartition));
    final ContentSummary outputSummary =
        clusterFS.getContentSummary(new Path(outputHDFSDir + "/dt=" + firstPartition));
    LOGGER.info("inputContentSummary = " + firstInputSummary.toString(false));
    LOGGER.info("inputContentSummary2 = " + secondInputSummary.toString(false));
    LOGGER.info("outputContentSummary = " + outputSummary.toString(false));

    final long combinedInputLength =
        firstInputSummary.getLength() + secondInputSummary.getLength();
    Assert.assertEquals(combinedInputLength, outputSummary.getLength(),
        "Unexpected size of the output.");
}
/**
 * Schedules a hive process that reads one HCat table and writes two HCat tables, waits for the
 * first instance to succeed and compares the content size of the first input location with
 * the matching partition directory of each output.
 *
 * @param separator text placed between the year, month, day and hour parts of a partition value
 * @throws Exception if data upload, table setup, scheduling or verification fails
 */
@Test(dataProvider = "generateSeparators")
public void oneHCatInputTwoHCatOutput(String separator) throws Exception {
    final Bundle bundle = bundles[0];

    // Upload the local data under the input directory for the generated dates.
    final String datePattern =
        StringUtils.join(new String[]{"yyyy", "MM", "dd", "HH"}, separator);
    final List<String> dataDates = getDatesList(startDate, endDate, datePattern, 60);
    final List<String> dataset = HadoopUtil
        .flattenAndPutDataInFolder(clusterFS, localHCatData, inputHDFSDir, dataDates);

    // All tables share the same data columns and the same partition column.
    final ArrayList<HCatFieldSchema> cols = new ArrayList<>();
    cols.add(HCatUtil.getStringSchema(col1Name, col1Name + " comment"));
    cols.add(HCatUtil.getStringSchema(col2Name, col2Name + " comment"));
    final ArrayList<HCatFieldSchema> partitionCols = new ArrayList<>();
    partitionCols.add(
        HCatUtil.getStringSchema(partitionColumn, partitionColumn + " partition"));

    clusterHC.createTable(HCatCreateTableDesc
        .create(dbName, inputTableName, cols)
        .partCols(partitionCols)
        .ifNotExists(true)
        .isTableExternal(true)
        .location(inputHDFSDir)
        .build());
    clusterHC.createTable(HCatCreateTableDesc
        .create(dbName, outputTableName, cols)
        .partCols(partitionCols)
        .ifNotExists(true)
        .isTableExternal(true)
        .location(outputHDFSDir)
        .build());
    clusterHC.createTable(HCatCreateTableDesc
        .create(dbName, outputTableName2, cols)
        .partCols(partitionCols)
        .ifNotExists(true)
        .isTableExternal(true)
        .location(outputHDFSDir2)
        .build());

    addPartitionsToTable(dataDates, dataset, partitionColumn, dbName, inputTableName);

    final String tableUriPartitionFragment = StringUtils.join(
        new String[]{"#dt=${YEAR}", "${MONTH}", "${DAY}", "${HOUR}"}, separator);
    final String inputTableUri =
        "catalog:" + dbName + ":" + inputTableName + tableUriPartitionFragment;
    bundle.setInputFeedTableUri(inputTableUri);
    bundle.setInputFeedPeriodicity(1, Frequency.TimeUnit.hours);
    bundle.setInputFeedValidity(startDate, endDate);

    final String outputTableUri =
        "catalog:" + dbName + ":" + outputTableName + tableUriPartitionFragment;
    final String outputTableUri2 =
        "catalog:" + dbName + ":" + outputTableName2 + tableUriPartitionFragment;
    bundle.setOutputFeedTableUri(outputTableUri);
    bundle.setOutputFeedPeriodicity(1, Frequency.TimeUnit.hours);
    bundle.setOutputFeedValidity(startDate, endDate);

    // The second output feed is a renamed copy of the bundle's OUTPUT feed (not of the input
    // feed), so it carries the output feed's definition and an output-derived name, exactly
    // as twoHCatInputTwoHCatOutput builds its second output feed.
    final String outputFeed1 = bundle.getOutputFeedFromBundle();
    final String outputFeed2Name = Util.readEntityName(outputFeed1) + "-second";
    final FeedMerlin secondOutputFeed = new FeedMerlin(outputFeed1);
    secondOutputFeed.setName(outputFeed2Name);
    secondOutputFeed.getTable().setUri(outputTableUri2);
    bundle.addOutputFeedToBundle("outputData2", secondOutputFeed);

    // Configure and schedule the process, then wait for one successful instance.
    bundle.setProcessValidity(startDate, endDate);
    bundle.setProcessPeriodicity(1, Frequency.TimeUnit.hours);
    bundle.setProcessInputStartEnd("now(0,0)", "now(0,0)");
    bundle.setProcessWorkflow(hiveScriptOneHCatInputTwoHCatOutput, EngineType.HIVE);
    bundle.submitFeedsScheduleProcess();

    InstanceUtil.waitTillInstanceReachState(
        clusterOC, bundle.getProcessName(), 1, CoordinatorAction.Status.SUCCEEDED,
        EntityType.PROCESS);

    final String firstDate = dataDates.get(0);
    AssertUtil.checkContentSize(inputHDFSDir + "/" + firstDate,
        outputHDFSDir + "/dt=" + firstDate, clusterFS);
    AssertUtil.checkContentSize(inputHDFSDir + "/" + firstDate,
        outputHDFSDir2 + "/dt=" + firstDate, clusterFS);
}
/**
 * Schedules a hive process that reads two HCat tables and writes two HCat tables, waits for
 * the first instance to succeed and asserts that the length of the first partition of each
 * output equals the summed length of the first locations of both inputs.
 *
 * @param separator text placed between the year, month, day and hour parts of a partition value
 * @throws Exception if data upload, table setup, scheduling or verification fails
 */
@Test(dataProvider = "generateSeparators")
public void twoHCatInputTwoHCatOutput(String separator) throws Exception {
    final Bundle bundle = bundles[0];

    // Upload the same local data under both input directories.
    final String partitionValuePattern =
        StringUtils.join(new String[]{"yyyy", "MM", "dd", "HH"}, separator);
    final List<String> partitionValues =
        getDatesList(startDate, endDate, partitionValuePattern, 60);
    final List<String> firstInputLocations = HadoopUtil.flattenAndPutDataInFolder(
        clusterFS, localHCatData, inputHDFSDir, partitionValues);
    final List<String> secondInputLocations = HadoopUtil.flattenAndPutDataInFolder(
        clusterFS, localHCatData, inputHDFSDir2, partitionValues);

    // All tables share the same data columns and the same partition column.
    final ArrayList<HCatFieldSchema> dataColumns = new ArrayList<>();
    dataColumns.add(HCatUtil.getStringSchema(col1Name, col1Name + " comment"));
    dataColumns.add(HCatUtil.getStringSchema(col2Name, col2Name + " comment"));
    final ArrayList<HCatFieldSchema> partitionColumns = new ArrayList<>();
    partitionColumns.add(
        HCatUtil.getStringSchema(partitionColumn, partitionColumn + " partition"));

    final HCatCreateTableDesc firstInputDesc = HCatCreateTableDesc
        .create(dbName, inputTableName, dataColumns)
        .partCols(partitionColumns)
        .ifNotExists(true)
        .isTableExternal(true)
        .location(inputHDFSDir)
        .build();
    clusterHC.createTable(firstInputDesc);
    final HCatCreateTableDesc secondInputDesc = HCatCreateTableDesc
        .create(dbName, inputTableName2, dataColumns)
        .partCols(partitionColumns)
        .ifNotExists(true)
        .isTableExternal(true)
        .location(inputHDFSDir2)
        .build();
    clusterHC.createTable(secondInputDesc);
    final HCatCreateTableDesc firstOutputDesc = HCatCreateTableDesc
        .create(dbName, outputTableName, dataColumns)
        .partCols(partitionColumns)
        .ifNotExists(true)
        .isTableExternal(true)
        .location(outputHDFSDir)
        .build();
    clusterHC.createTable(firstOutputDesc);
    final HCatCreateTableDesc secondOutputDesc = HCatCreateTableDesc
        .create(dbName, outputTableName2, dataColumns)
        .partCols(partitionColumns)
        .ifNotExists(true)
        .isTableExternal(true)
        .location(outputHDFSDir2)
        .build();
    clusterHC.createTable(secondOutputDesc);

    addPartitionsToTable(
        partitionValues, firstInputLocations, partitionColumn, dbName, inputTableName);
    addPartitionsToTable(
        partitionValues, secondInputLocations, partitionColumn, dbName, inputTableName2);

    final String partitionSpec = StringUtils.join(
        new String[]{"#dt=${YEAR}", "${MONTH}", "${DAY}", "${HOUR}"}, separator);
    final String catalogPrefix = "catalog:" + dbName + ":";

    // First input feed ships with the bundle; the second is a renamed copy of it.
    bundle.setInputFeedTableUri(catalogPrefix + inputTableName + partitionSpec);
    bundle.setInputFeedPeriodicity(1, Frequency.TimeUnit.hours);
    bundle.setInputFeedValidity(startDate, endDate);
    final String firstInputFeed = bundle.getInputFeedFromBundle();
    final FeedMerlin secondInputFeed = new FeedMerlin(firstInputFeed);
    secondInputFeed.setName(Util.readEntityName(firstInputFeed) + "-second");
    secondInputFeed.getTable().setUri(catalogPrefix + inputTableName2 + partitionSpec);
    bundle.addInputFeedToBundle("inputData2", secondInputFeed);

    // Same approach for the outputs: configure the first, copy it for the second.
    bundle.setOutputFeedTableUri(catalogPrefix + outputTableName + partitionSpec);
    bundle.setOutputFeedPeriodicity(1, Frequency.TimeUnit.hours);
    bundle.setOutputFeedValidity(startDate, endDate);
    final String firstOutputFeed = bundle.getOutputFeedFromBundle();
    final FeedMerlin secondOutputFeed = new FeedMerlin(firstOutputFeed);
    secondOutputFeed.setName(Util.readEntityName(firstOutputFeed) + "-second");
    secondOutputFeed.getTable().setUri(catalogPrefix + outputTableName2 + partitionSpec);
    bundle.addOutputFeedToBundle("outputData2", secondOutputFeed);

    // Configure and schedule the process, then wait for one successful instance.
    bundle.setProcessValidity(startDate, endDate);
    bundle.setProcessPeriodicity(1, Frequency.TimeUnit.hours);
    bundle.setProcessInputStartEnd("now(0,0)", "now(0,0)");
    bundle.setProcessWorkflow(hiveScriptTwoHCatInputTwoHCatOutput, EngineType.HIVE);
    bundle.submitFeedsScheduleProcess();

    InstanceUtil.waitTillInstanceReachState(
        clusterOC, bundle.getProcessName(), 1, CoordinatorAction.Status.SUCCEEDED,
        EntityType.PROCESS);

    final String firstPartition = partitionValues.get(0);
    final ContentSummary firstInputSummary =
        clusterFS.getContentSummary(new Path(inputHDFSDir + "/" + firstPartition));
    final ContentSummary secondInputSummary =
        clusterFS.getContentSummary(new Path(inputHDFSDir2 + "/" + firstPartition));
    final ContentSummary firstOutputSummary =
        clusterFS.getContentSummary(new Path(outputHDFSDir + "/dt=" + firstPartition));
    final ContentSummary secondOutputSummary =
        clusterFS.getContentSummary(new Path(outputHDFSDir2 + "/dt=" + firstPartition));
    LOGGER.info("inputContentSummary = " + firstInputSummary.toString(false));
    LOGGER.info("inputContentSummary2 = " + secondInputSummary.toString(false));
    LOGGER.info("outputContentSummary = " + firstOutputSummary.toString(false));
    LOGGER.info("outputContentSummary2 = " + secondOutputSummary.toString(false));

    Assert.assertEquals(
        firstInputSummary.getLength() + secondInputSummary.getLength(),
        firstOutputSummary.getLength(),
        "Unexpected size of the output.");
    Assert.assertEquals(
        firstInputSummary.getLength() + secondInputSummary.getLength(),
        secondOutputSummary.getLength(),
        "Unexpected size of the output.");
}
/**
 * Schedules a hive process that reads one HCat table and writes to a feed taken from the EL
 * bundle (configured with an HDFS location instead of a table), waits for the first instance
 * to succeed and compares the content size of the first input location with the matching
 * output directory.
 *
 * @param separator text placed between the year, month, day and hour parts of a partition value
 * @throws Exception if data upload, table setup, scheduling or verification fails
 */
@Test(dataProvider = "generateSeparators")
public void oneHCatInputOneNonHCatOutput(String separator) throws Exception {
    final Bundle bundle = bundles[0];

    // Upload the local data under the input directory for the generated dates.
    final String partitionValuePattern =
        StringUtils.join(new String[]{"yyyy", "MM", "dd", "HH"}, separator);
    final List<String> partitionValues =
        getDatesList(startDate, endDate, partitionValuePattern, 60);
    final List<String> partitionLocations = HadoopUtil.flattenAndPutDataInFolder(
        clusterFS, localHCatData, inputHDFSDir, partitionValues);

    // Only the input side is an HCat table in this scenario.
    final ArrayList<HCatFieldSchema> dataColumns = new ArrayList<>();
    dataColumns.add(HCatUtil.getStringSchema(col1Name, col1Name + " comment"));
    dataColumns.add(HCatUtil.getStringSchema(col2Name, col2Name + " comment"));
    final ArrayList<HCatFieldSchema> partitionColumns = new ArrayList<>();
    partitionColumns.add(
        HCatUtil.getStringSchema(partitionColumn, partitionColumn + " partition"));
    final HCatCreateTableDesc inputTableDesc = HCatCreateTableDesc
        .create(dbName, inputTableName, dataColumns)
        .partCols(partitionColumns)
        .ifNotExists(true)
        .isTableExternal(true)
        .location(inputHDFSDir)
        .build();
    clusterHC.createTable(inputTableDesc);

    addPartitionsToTable(
        partitionValues, partitionLocations, partitionColumn, dbName, inputTableName);

    final String partitionSpec = StringUtils.join(
        new String[]{"#dt=${YEAR}", "${MONTH}", "${DAY}", "${HOUR}"}, separator);
    bundle.setInputFeedTableUri("catalog:" + dbName + ":" + inputTableName + partitionSpec);
    bundle.setInputFeedPeriodicity(1, Frequency.TimeUnit.hours);
    bundle.setInputFeedValidity(startDate, endDate);

    // Replace the bundle's output feed with the EL bundle's output feed, keeping the
    // original feed name and binding it to the single cluster of this bundle.
    final FeedMerlin locationBasedFeed =
        new FeedMerlin(BundleUtil.readELBundle().getOutputFeedFromBundle());
    final String outputFeedName = bundle.getOutputFeedNameFromBundle();
    locationBasedFeed.setName(outputFeedName);
    final List<String> clusterNames = bundle.getClusterNames();
    Assert.assertEquals(clusterNames.size(), 1, "Expected only one cluster in the bundle.");
    locationBasedFeed.setClusterNameInFeed(clusterNames.get(0), 0);
    bundle.writeFeedElement(locationBasedFeed, outputFeedName);

    final String outputLocationSuffix = StringUtils.join(
        new String[]{"${YEAR}", "${MONTH}", "${DAY}", "${HOUR}"}, separator);
    bundle.setOutputFeedLocationData(outputHDFSDir + "/" + outputLocationSuffix);
    bundle.setOutputFeedPeriodicity(1, Frequency.TimeUnit.hours);
    bundle.setOutputFeedValidity(startDate, endDate);

    // Configure and schedule the process, then wait for one successful instance.
    bundle.setProcessValidity(startDate, endDate);
    bundle.setProcessPeriodicity(1, Frequency.TimeUnit.hours);
    bundle.setProcessInputStartEnd("now(0,0)", "now(0,0)");
    bundle.setProcessWorkflow(hiveScriptFileNonHCatOutput, EngineType.HIVE);
    bundle.submitFeedsScheduleProcess();

    InstanceUtil.waitTillInstanceReachState(
        clusterOC, bundle.getProcessName(), 1, CoordinatorAction.Status.SUCCEEDED,
        EntityType.PROCESS);

    final String firstPartition = partitionValues.get(0);
    AssertUtil.checkContentSize(
        inputHDFSDir + "/" + firstPartition,
        outputHDFSDir + "/" + firstPartition,
        clusterFS);
}
/**
 * Schedules a hive process that reads from a feed taken from the EL bundle (configured with an
 * HDFS data path instead of a table) and writes one HCat table, waits for the first instance
 * to succeed and compares the content size of the first input directory with the matching
 * output partition directory.
 *
 * @param separator text placed between the year, month, day and hour parts of a partition value
 * @throws Exception if data upload, table setup, scheduling or verification fails
 */
@Test(dataProvider = "generateSeparators")
public void oneNonCatInputOneHCatOutput(String separator) throws Exception {
    final Bundle bundle = bundles[0];

    // Upload the local data under the input directory; no input table or partitions are
    // created here, so the returned locations are not needed.
    final String partitionValuePattern =
        StringUtils.join(new String[]{"yyyy", "MM", "dd", "HH"}, separator);
    final List<String> partitionValues =
        getDatesList(startDate, endDate, partitionValuePattern, 60);
    HadoopUtil.flattenAndPutDataInFolder(
        clusterFS, localHCatData, inputHDFSDir, partitionValues);

    // Only the output side is an HCat table in this scenario.
    final ArrayList<HCatFieldSchema> dataColumns = new ArrayList<>();
    dataColumns.add(HCatUtil.getStringSchema(col1Name, col1Name + " comment"));
    dataColumns.add(HCatUtil.getStringSchema(col2Name, col2Name + " comment"));
    final ArrayList<HCatFieldSchema> partitionColumns = new ArrayList<>();
    partitionColumns.add(
        HCatUtil.getStringSchema(partitionColumn, partitionColumn + " partition"));
    final HCatCreateTableDesc outputTableDesc = HCatCreateTableDesc
        .create(dbName, outputTableName, dataColumns)
        .partCols(partitionColumns)
        .ifNotExists(true)
        .isTableExternal(true)
        .location(outputHDFSDir)
        .build();
    clusterHC.createTable(outputTableDesc);

    // Replace the bundle's input feed with the EL bundle's input feed, keeping the
    // original feed name and binding it to the single cluster of this bundle.
    final FeedMerlin pathBasedFeed =
        new FeedMerlin(BundleUtil.readELBundle().getInputFeedFromBundle());
    final String inputFeedName = bundle.getInputFeedNameFromBundle();
    pathBasedFeed.setName(inputFeedName);
    final List<String> clusterNames = bundle.getClusterNames();
    Assert.assertEquals(clusterNames.size(), 1, "Expected only one cluster in the bundle.");
    pathBasedFeed.setClusterNameInFeed(clusterNames.get(0), 0);
    bundle.writeFeedElement(pathBasedFeed, inputFeedName);

    final String inputPathSuffix = StringUtils.join(
        new String[]{"${YEAR}", "${MONTH}", "${DAY}", "${HOUR}"}, separator);
    bundle.setInputFeedDataPath(inputHDFSDir + "/" + inputPathSuffix);
    bundle.setInputFeedPeriodicity(1, Frequency.TimeUnit.hours);
    bundle.setInputFeedValidity(startDate, endDate);

    final String partitionSpec = StringUtils.join(
        new String[]{"#dt=${YEAR}", "${MONTH}", "${DAY}", "${HOUR}"}, separator);
    bundle.setOutputFeedTableUri("catalog:" + dbName + ":" + outputTableName + partitionSpec);
    bundle.setOutputFeedPeriodicity(1, Frequency.TimeUnit.hours);
    bundle.setOutputFeedValidity(startDate, endDate);

    // Configure and schedule the process, then wait for one successful instance.
    bundle.setProcessWorkflow(hiveScriptFileNonHCatInput, EngineType.HIVE);
    bundle.setProcessValidity(startDate, endDate);
    bundle.setProcessPeriodicity(1, Frequency.TimeUnit.hours);
    bundle.setProcessInputStartEnd("now(0,0)", "now(0,0)");
    bundle.submitFeedsScheduleProcess();

    InstanceUtil.waitTillInstanceReachState(
        clusterOC, bundle.getProcessName(), 1, CoordinatorAction.Status.SUCCEEDED,
        EntityType.PROCESS);

    final String firstPartition = partitionValues.get(0);
    AssertUtil.checkContentSize(
        inputHDFSDir + "/" + firstPartition,
        outputHDFSDir + "/dt=" + firstPartition,
        clusterFS);
}
/**
 * Registers one partition per entry of {@code partitions} on the given table, pairing the
 * i-th partition value with the i-th location, and submits them in a single
 * {@code addPartitions} call.
 *
 * @param partitions         values of the partition column, one per partition
 * @param partitionLocations locations of the partitions, in the same order as the values
 * @param partitionCol       name of the partition column
 * @param databaseName       database that holds the table
 * @param tableName          table that receives the partitions
 * @throws HCatException if a partition description cannot be built or the call fails
 */
private void addPartitionsToTable(List<String> partitions, List<String> partitionLocations,
                                  String partitionCol,
                                  String databaseName, String tableName) throws HCatException {
    final int partitionCount = partitions.size();
    Assert.assertEquals(partitionCount, partitionLocations.size(),
        "Number of locations is not same as number of partitions.");

    final List<HCatAddPartitionDesc> descriptors = new ArrayList<>();
    int index = 0;
    while (index < partitionCount) {
        final Map<String, String> partitionSpec = new HashMap<>();
        partitionSpec.put(partitionCol, partitions.get(index));
        final String location = partitionLocations.get(index);
        final HCatAddPartitionDesc descriptor = HCatAddPartitionDesc
            .create(databaseName, tableName, location, partitionSpec)
            .build();
        descriptors.add(descriptor);
        index++;
    }
    clusterHC.addPartitions(descriptors);
}
/**
 * Formats a sequence of instants that starts at {@code startDate} and advances by
 * {@code skipMinutes} each step.
 *
 * <p>The start date is always included. Stepping continues until a value strictly after
 * {@code endDate} has been added, so the last element of the result lies past
 * {@code endDate}.
 *
 * @param startDate   first instant, in the format accepted by {@code TimeUtil.oozieDateToDate}
 * @param endDate     instant that bounds the stepping, in the same format
 * @param datePattern Joda-Time pattern used to format every element
 * @param skipMinutes step between consecutive elements in minutes; must be positive
 * @return the formatted instants in ascending order
 * @throws IllegalArgumentException if {@code skipMinutes} is not positive
 */
public static List<String> getDatesList(String startDate, String endDate, String datePattern,
                                        int skipMinutes) {
    // A zero or negative step never moves the cursor past endDate, so the loop below would
    // keep adding elements until memory runs out.
    if (skipMinutes <= 0) {
        throw new IllegalArgumentException(
            "skipMinutes must be positive, but was " + skipMinutes);
    }
    DateTime cursor = new DateTime(TimeUtil.oozieDateToDate(startDate));
    final DateTime limit = new DateTime(TimeUtil.oozieDateToDate(endDate));
    final DateTimeFormatter formatter = DateTimeFormat.forPattern(datePattern);
    LOGGER.info("generating data between " + formatter.print(cursor) + " and "
        + formatter.print(limit));
    final List<String> dates = new ArrayList<>();
    dates.add(formatter.print(cursor));
    // The check looks at the value added last, hence the one element past the limit.
    while (!cursor.isAfter(limit)) {
        cursor = cursor.plusMinutes(skipMinutes);
        dates.add(formatter.print(cursor));
    }
    return dates;
}
/**
 * Runs after every test method and delegates to removeTestClassEntities(), which is not
 * defined in this file (presumably inherited from BaseTestClass -- confirm there which
 * entities it removes).
 */
@AfterMethod(alwaysRun = true)
public void tearDown() {
removeTestClassEntities();
}
}