/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.falcon.regression.prism;
import org.apache.falcon.entity.v0.EntityType;
import org.apache.falcon.entity.v0.Frequency;
import org.apache.falcon.entity.v0.feed.LateArrival;
import org.apache.falcon.entity.v0.feed.Lifecycle;
import org.apache.falcon.entity.v0.feed.Properties;
import org.apache.falcon.entity.v0.feed.Property;
import org.apache.falcon.entity.v0.feed.RetentionStage;
import org.apache.falcon.regression.Entities.FeedMerlin;
import org.apache.falcon.regression.core.bundle.Bundle;
import org.apache.falcon.regression.core.enumsAndConstants.FreqType;
import org.apache.falcon.regression.core.enumsAndConstants.RetentionUnit;
import org.apache.falcon.regression.core.helpers.ColoHelper;
import org.apache.falcon.regression.core.response.ServiceResponse;
import org.apache.falcon.regression.core.supportClasses.JmsMessageConsumer;
import org.apache.falcon.regression.core.util.AssertUtil;
import org.apache.falcon.regression.core.util.BundleUtil;
import org.apache.falcon.regression.core.util.HadoopUtil;
import org.apache.falcon.regression.core.util.MatrixUtil;
import org.apache.falcon.regression.core.util.OSUtil;
import org.apache.falcon.regression.core.util.OozieUtil;
import org.apache.falcon.regression.core.util.TimeUtil;
import org.apache.falcon.regression.core.util.Util;
import org.apache.falcon.regression.testHelper.BaseTestClass;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.security.authentication.client.AuthenticationException;
import org.apache.log4j.Logger;
import org.apache.oozie.client.CoordinatorJob;
import org.apache.oozie.client.OozieClient;
import org.apache.oozie.client.OozieClientException;
import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.json.JSONException;
import org.testng.Assert;
import org.testng.annotations.AfterMethod;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import javax.jms.JMSException;
import javax.jms.MapMessage;
import java.io.IOException;
import java.net.URISyntaxException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
/**
* Test with retention.
*/
@Test(groups = { "distributed", "embedded" })
public class RetentionTest extends BaseTestClass {
private static final String TEST_FOLDERS = "/testFolders/";
private String baseTestHDFSDir = cleanAndGetTestDir();
private String testHDFSDir = baseTestHDFSDir + TEST_FOLDERS;
private static final Logger LOGGER = Logger.getLogger(RetentionTest.class);
private ColoHelper cluster = servers.get(0);
private FileSystem clusterFS = serverFS.get(0);
private OozieClient clusterOC = serverOC.get(0);
private static final int[] GAPS = new int[]{2, 4, 5, 1};
@BeforeMethod(alwaysRun = true)
public void setup() throws Exception {
Bundle bundle = BundleUtil.readRetentionBundle();
bundles[0] = new Bundle(bundle, cluster);
bundles[0].setInputFeedDataPath(testHDFSDir);
bundles[0].generateUniqueBundle(this);
bundles[0].submitClusters(prism);
}
@AfterMethod(alwaysRun = true)
public void tearDown() throws Exception {
removeTestClassEntities();
}
/**
* Particular test case for https://issues.apache.org/jira/browse/FALCON-321.
* @throws Exception
*/
@Test
public void testRetentionWithEmptyDirectories() throws Exception {
testRetention(24, RetentionUnit.HOURS, true, FreqType.DAILY, false);
}
/**
* Tests retention with different parameters. Validates its results based on expected and
* actual retained data.
*
* @param retentionPeriod period for which data should be retained
* @param retentionUnit type of retention limit attribute
* @param gaps defines gaps within list of data folders
* @param freqType feed type
* @param withData should folders be filled with data or not
* @throws Exception
*/
@Test(groups = {"0.1", "0.2", "prism"}, dataProvider = "betterDP", priority = -1)
public void testRetention(final int retentionPeriod, final RetentionUnit retentionUnit,
final boolean gaps, final FreqType freqType, final boolean withData) throws Exception {
bundles[0].setInputFeedDataPath(testHDFSDir + freqType.getPathValue());
final FeedMerlin feedObject = new FeedMerlin(bundles[0].getInputFeedFromBundle());
feedObject.setRetentionValue(retentionUnit.getValue() + "(" + retentionPeriod + ")");
final ServiceResponse response = prism.getFeedHelper().submitEntity(feedObject.toString());
if (retentionPeriod > 0) {
AssertUtil.assertSucceeded(response);
replenishData(freqType, gaps, withData);
commonDataRetentionWorkflow(feedObject.toString(), freqType, retentionUnit,
retentionPeriod);
} else {
AssertUtil.assertFailed(response);
}
}
/**
* Generates folders based on proposed periodicity and then fills them with data if required.
*
* @param freqType feed retention limit type
* @param gap defines what amount of units should be skipped
* @param withData should folders be filled with data or not
* @throws Exception
*/
private void replenishData(FreqType freqType, boolean gap, boolean withData) throws Exception {
int skip = 1;
if (gap) {
skip = GAPS[new Random().nextInt(GAPS.length)];
}
final DateTime today = new DateTime(DateTimeZone.UTC);
final List<DateTime> times = TimeUtil.getDatesOnEitherSide(
freqType.addTime(today, -36), freqType.addTime(today, -1), skip, freqType);
final List<String> dataDates = TimeUtil.convertDatesToString(times, freqType.getFormatter());
LOGGER.info("dataDates = " + dataDates);
dataDates.add(HadoopUtil.SOMETHING_RANDOM);
if (withData) {
HadoopUtil.flattenAndPutDataInFolder(clusterFS, OSUtil.SINGLE_FILE, testHDFSDir, dataDates);
} else {
HadoopUtil.createFolders(clusterFS, testHDFSDir, dataDates);
}
}
/**
* Schedules feed and waits till retention succeeds. Makes validation of data which was removed
* and which was retained.
*
* @param feed analyzed retention feed
* @param freqType feed type
* @param retentionUnit type of retention limit attribute
* @param retentionPeriod period for which data should be retained
* @throws OozieClientException
* @throws IOException
* @throws URISyntaxException
* @throws AuthenticationException
* @throws JMSException
*/
private void commonDataRetentionWorkflow(String feed, FreqType freqType,
RetentionUnit retentionUnit, int retentionPeriod) throws OozieClientException,
IOException, URISyntaxException, AuthenticationException, JMSException,
InterruptedException {
//get Data created in the cluster
List<String> initialData = Util.getHadoopDataFromDir(clusterFS, feed, testHDFSDir);
cluster.getFeedHelper().schedule(feed);
LOGGER.info(cluster.getClusterHelper().getActiveMQ());
final String feedName = Util.readEntityName(feed);
LOGGER.info(feedName);
JmsMessageConsumer messageConsumer = new JmsMessageConsumer("FALCON." + feedName,
cluster.getClusterHelper().getActiveMQ());
messageConsumer.start();
String bundleId = OozieUtil.getBundles(clusterOC, feedName, EntityType.FEED).get(0);
List<String> workflows = OozieUtil.waitForRetentionWorkflowToSucceed(bundleId, clusterOC);
//get current time minus duration of last status check - to get actual time when eviction has started
final DateTime currentTime = new DateTime(DateTimeZone.UTC).minus(10000);
LOGGER.info("Current time is " + new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(currentTime.toDate()));
LOGGER.info("workflows: " + workflows);
messageConsumer.interrupt();
Util.printMessageData(messageConsumer);
//now look for cluster data
List<String> finalData = Util.getHadoopDataFromDir(clusterFS, feed, testHDFSDir);
//now see if retention value was matched to as expected
List<String> expectedOutput = filterDataOnRetention(initialData, currentTime, retentionUnit,
retentionPeriod, freqType);
LOGGER.info("initialData = " + initialData);
LOGGER.info("finalData = " + finalData);
LOGGER.info("expectedOutput = " + expectedOutput);
final List<String> missingData = new ArrayList<>(initialData);
missingData.removeAll(expectedOutput);
validateDataFromFeedQueue(feedName, messageConsumer.getReceivedMessages(), missingData);
Assert.assertEquals(finalData.size(), expectedOutput.size(),
"Expected and actual sizes of retained data are different! Please check.");
Assert.assertTrue(Arrays.deepEquals(finalData.toArray(new String[finalData.size()]),
expectedOutput.toArray(new String[expectedOutput.size()])));
//check that root directory exists
Assert.assertTrue(clusterFS.exists(new Path(testHDFSDir)), "Base data directory should be present.");
}
/**
* Makes validation based on comparison of data which is expected to be removed with data
* mentioned in messages from ActiveMQ.
*
* @param feedName feed name
* @param messages messages from ActiveMQ
* @param missingData data which is expected to be removed after retention succeeded
* @throws OozieClientException
* @throws JMSException
*/
private void validateDataFromFeedQueue(String feedName, List<MapMessage> messages,
List<String> missingData) throws OozieClientException, JMSException {
//just verify that each element in queue is same as deleted data!
List<String> workflowIds = OozieUtil.getWorkflowJobs(clusterOC,
OozieUtil.getBundles(clusterOC, feedName, EntityType.FEED).get(0));
//create queue data folderList:
List<String> deletedFolders = new ArrayList<>();
for (MapMessage message : messages) {
if (message != null) {
Assert.assertEquals(message.getString("entityName"), feedName);
String[] splitData = message.getString("feedInstancePaths").split(TEST_FOLDERS);
deletedFolders.add(splitData[splitData.length - 1]);
Assert.assertEquals(message.getString("operation"), "DELETE");
Assert.assertEquals(message.getString("workflowId"), workflowIds.get(0));
//verify other data also
Assert.assertEquals(message.getJMSDestination().toString(), "topic://FALCON." + feedName);
Assert.assertEquals(message.getString("status"), "SUCCEEDED");
}
}
Assert.assertEquals(deletedFolders.size(), missingData.size(),
"Output size is different than expected!");
Assert.assertTrue(Arrays.deepEquals(missingData.toArray(new String[missingData.size()]),
deletedFolders.toArray(new String[deletedFolders.size()])),
"The missing data and message for delete operation don't correspond");
}
/**
* Evaluates amount of data which is expected to be retained.
*
* @param inputData initial data on cluster
* @param currentTime current date
* @param retentionUnit type of retention limit attribute
* @param retentionPeriod period for which data should be retained
* @param freqType feed type
* @return list of data folders which are expected to be present on cluster
*/
private List<String> filterDataOnRetention(List<String> inputData, DateTime currentTime,
RetentionUnit retentionUnit, int retentionPeriod, FreqType freqType) {
final List<String> finalData = new ArrayList<>();
//end date is today's date
final String startLimit = freqType.getFormatter().print(
retentionUnit.minusTime(currentTime, retentionPeriod));
//now to actually check!
for (String testDate : inputData) {
if (testDate.equals(HadoopUtil.SOMETHING_RANDOM)
|| testDate.compareTo(startLimit) > 0) {
finalData.add(testDate);
}
}
return finalData;
}
/**
* Provides different sets of parameters for retention workflow.
*/
@DataProvider(name = "betterDP")
public Object[][] getTestData() {
// a negative value like -4 should be covered in validation scenarios.
Integer[] retentionPeriods = new Integer[]{0, 10080, 60, 8, 24};
RetentionUnit[] retentionUnits = new RetentionUnit[]{
RetentionUnit.HOURS,
RetentionUnit.DAYS,
}; // "minutes","hours", "days",
Boolean[] gaps = new Boolean[]{false, true};
FreqType[] freqTypes = new FreqType[]{FreqType.DAILY, FreqType.YEARLY, FreqType.MONTHLY};
final Boolean[] withData = new Boolean[]{true};
return MatrixUtil.crossProduct(retentionPeriods, retentionUnits, gaps, freqTypes, withData);
}
/**
* Submit a feed having minutely lifecycle frequency.
* It would fail since lifecycle retention frequency has to be >= 1 hour.
*/
@Test
public void testTooFrequentRetentionLifecycleStage() throws Exception {
String startTime = TimeUtil.getTimeWrtSystemTime(0);
String endTime = TimeUtil.addMinsToTime(startTime, 120);
LateArrival lateArrival = new LateArrival();
lateArrival.setCutOff(new Frequency("1", Frequency.TimeUnit.minutes));
FreqType freqType = FreqType.MINUTELY;
Frequency retentionPeriodGlobal=new Frequency("30", Frequency.TimeUnit.minutes);
Frequency retentionFrequencyGlobal=new Frequency("15", Frequency.TimeUnit.minutes);
bundles[0].setInputFeedDataPath(testHDFSDir + freqType.getPathValue());
final FeedMerlin feedObject = new FeedMerlin(bundles[0].getInputFeedFromBundle());
feedObject.setLateArrival(lateArrival);
feedObject.setValidity(startTime, endTime);
feedObject.setFrequency(new Frequency("minutes(10)"));
feedObject.setRetentionValue("minutes(10)");
feedObject.setLifecycle(createLifecycle(retentionPeriodGlobal, retentionFrequencyGlobal,
"", "", true));
final ServiceResponse response = prism.getFeedHelper().submitEntity(feedObject.toString());
AssertUtil.assertFailedWithMessage(response, "Feed Retention can not be more frequent than hours(1)");
}
/**
* Submits and schedules a feed with lifecycle tag at cluster and global level.
* Responses are checked and retention is validated correspondingly.
* Uses lifecycleDPFail dataProvider to handle possible scenarios.
*
* @param globalLevel : boolean (whether lifecycle is enabled for global level or not)
* @param clusterLevel : boolean (whether lifecycle is enabled for cluster level or not)
*/
@Test(dataProvider = "lifecycleDPFail")
public void clusterGlobalNoRetentionStageTest(boolean globalLevel, boolean clusterLevel) throws Exception {
String startTime = TimeUtil.getTimeWrtSystemTime(0);
String endTime = TimeUtil.addMinsToTime(startTime, 120);
LateArrival lateArrival = new LateArrival();
lateArrival.setCutOff(new Frequency("1", Frequency.TimeUnit.hours));
final FeedMerlin feedObject = new FeedMerlin(bundles[0].getInputFeedFromBundle());
feedObject.setLateArrival(lateArrival);
feedObject.setValidity(startTime, endTime);
if (globalLevel) {
feedObject.setLifecycle(new Lifecycle());
}
if (clusterLevel) {
feedObject.getClusters().getClusters().get(0).setLifecycle(new Lifecycle());
}
final ServiceResponse response = prism.getFeedHelper().submitEntity(feedObject.toString());
AssertUtil.assertFailedWithMessage(response, "Retention is a mandatory stage, didn't find it for cluster");
}
@DataProvider(name = "lifecycleDPFail")
public Object[][] getLifecycleFail() {
return new Object[][]{
{true, true}, // cluster/global : No retention stage. Should fail.
{true, false}, // global : no retention stage. Should fail.
{false, true}, // cluster : no retention stage.Should fail.
};
}
/**
* Submits and schedules a feed with lifecycle tag at cluster and global level.
* Responses are checked and retention is validated correspondingly.
* Uses getLifecycleWithGlobalStage dataProvider to handle possible scenarios.
*
* @param globalLevel : boolean (whether lifecycle is enabled for global level or not)
* @param clusterLevel : boolean (whether lifecycle is enabled for cluster level or not)
*/
@Test(dataProvider = "getLifecycleWithGlobalStage")
public void retentionStageFromGlobalTest(boolean globalLevel, boolean clusterLevel) throws Exception {
String startTime = TimeUtil.getTimeWrtSystemTime(0);
String endTime = TimeUtil.addMinsToTime(startTime, 120);
FreqType freqType = FreqType.HOURLY;
Frequency retentionPeriodGlobal=new Frequency("2", Frequency.TimeUnit.hours);
Frequency retentionFrequencyGlobal=new Frequency("1", Frequency.TimeUnit.hours);
String priorityGlobal = "HIGH";
String queue = "default";
LateArrival lateArrival = new LateArrival();
lateArrival.setCutOff(new Frequency("1", Frequency.TimeUnit.hours));
bundles[0].setInputFeedDataPath(testHDFSDir + freqType.getPathValue());
final FeedMerlin feedObject = new FeedMerlin(bundles[0].getInputFeedFromBundle());
feedObject.setLateArrival(lateArrival);
feedObject.setValidity(startTime, endTime);
if (globalLevel) {
feedObject.setLifecycle(createLifecycle(retentionPeriodGlobal, retentionFrequencyGlobal,
priorityGlobal, queue, true));
}
if (clusterLevel) {
feedObject.getClusters().getClusters().get(0).setLifecycle(new Lifecycle());
}
replenishData(freqType, false, false);
final ServiceResponse response = prism.getFeedHelper().submitEntity(feedObject.toString());
AssertUtil.assertSucceeded(response);
commonDataRetentionWorkflow(feedObject.toString(), freqType, RetentionUnit.HOURS,
retentionPeriodGlobal.getFrequencyAsInt());
validateFrequency(feedObject.getName(), retentionFrequencyGlobal.getFrequencyAsInt()*60);
validatePriorityAndQueue(feedObject.getName(), priorityGlobal, queue);
}
@DataProvider(name = "getLifecycleWithGlobalStage")
public Object[][] getLifecycleWithGlobalStage() {
return new Object[][]{
{true, false}, // Global level lifecycle. Should pass.
{true, true}, // Cluster level : no retention stage - (pick from global). Should pass.
};
}
/**
* Submits and schedules a feed with lifecycle tag at cluster and global level.
* Responses are checked and retention is validated correspondingly.
* Uses getLifecycleWithClusterStage dataProvider to handle possible scenarios.
*
* @param globalLevel : boolean (whether lifecycle is enabled for global level or not)
* @param globalWithStage : boolean (whether global lifecycle has retention stage defined or not)
*/
@Test(dataProvider = "getLifecycleWithClusterStage")
public void retentionStageFromClusterTest(boolean globalLevel, boolean globalWithStage) throws Exception {
String startTime = TimeUtil.getTimeWrtSystemTime(0);
String endTime = TimeUtil.addMinsToTime(startTime, 120);
FreqType freqType = FreqType.HOURLY;
Frequency retentionPeriodGlobal=new Frequency("2", Frequency.TimeUnit.hours);
Frequency retentionFrequencyGlobal=new Frequency("1", Frequency.TimeUnit.hours);
Frequency retentionPeriodCluster=new Frequency("4", Frequency.TimeUnit.hours);
Frequency retentionFrequencyCluster=new Frequency("3", Frequency.TimeUnit.hours);
String priorityGlobal = "HIGH";
String priorityCluster = "LOW";
String queue = "default";
LateArrival lateArrival = new LateArrival();
lateArrival.setCutOff(new Frequency("1", Frequency.TimeUnit.hours));
bundles[0].setInputFeedDataPath(testHDFSDir + freqType.getPathValue());
final FeedMerlin feedObject = new FeedMerlin(bundles[0].getInputFeedFromBundle());
feedObject.setLateArrival(lateArrival);
feedObject.setValidity(startTime, endTime);
if (globalLevel) {
feedObject.setLifecycle(createLifecycle(retentionPeriodGlobal, retentionFrequencyGlobal,
priorityGlobal, queue, globalWithStage));
}
feedObject.getClusters().getClusters().get(0).setLifecycle(createLifecycle(retentionPeriodCluster,
retentionFrequencyCluster, priorityCluster, queue, true));
replenishData(freqType, false, false);
final ServiceResponse response = prism.getFeedHelper().submitEntity(feedObject.toString());
AssertUtil.assertSucceeded(response);
commonDataRetentionWorkflow(feedObject.toString(), freqType, RetentionUnit.HOURS,
retentionPeriodCluster.getFrequencyAsInt());
validateFrequency(feedObject.getName(), retentionFrequencyCluster.getFrequencyAsInt()*60);
validatePriorityAndQueue(feedObject.getName(), priorityCluster, queue);
}
@DataProvider(name = "getLifecycleWithClusterStage")
public Object[][] getLifecycleWithClusterStage() {
return new Object[][]{
{true, true}, // Cluster level lifecylce. Should pass.
{false, false}, // Cluster level with no global level lifecylce. Should pass.
{true, false}, // Cluster level with empty global level lifecycle.Should pass.
};
}
/**
* Method to create lifecycle tag to be used by feed for lifecycle retention.
* @param retentionPeriod : lifecycle retention period.
* @param retentionFrequency : lifecycle retention frequency.
* @param priority : lifecycle retention priority.
* @param queue : lifecycle retention queue.
*/
private Lifecycle createLifecycle(Frequency retentionPeriod, Frequency retentionFrequency,
String priority, String queue, boolean stage) {
Lifecycle lifecycle = new Lifecycle();
if (stage) {
String limitPropertyName = "retention.policy.agebaseddelete.limit";
Property property = new Property();
property.setName(limitPropertyName);
property.setValue(retentionPeriod.getTimeUnit() + "(" + retentionPeriod.getFrequencyAsInt() + ")");
Properties properties = new Properties();
properties.getProperties().add(property);
RetentionStage retentionStage = new RetentionStage();
retentionStage.setFrequency(new Frequency(retentionFrequency.getTimeUnit()
+ "(" + retentionFrequency.getFrequencyAsInt() + ")"));
if (!priority.isEmpty()) {
retentionStage.setPriority(priority);
}
if (!queue.isEmpty()) {
retentionStage.setQueue(queue);
}
retentionStage.setProperties(properties);
lifecycle.setRetentionStage(retentionStage);
}
return lifecycle;
}
/**
* Validates feed retention frequency with expected frequency.
* @param feedName : feed name.
* @param frequency : expected frequency.
*/
private void validateFrequency(String feedName, int frequency)
throws OozieClientException, JMSException, JSONException {
List<CoordinatorJob> coordJobs = OozieUtil.getBundleCoordinators(clusterOC,
OozieUtil.getBundles(clusterOC, feedName, EntityType.FEED).get(0));
CoordinatorJob coordJobInfo = clusterOC.getCoordJobInfo(coordJobs.get(0).getId());
Assert.assertEquals(coordJobInfo.getFrequency(), String.valueOf(frequency),
"Invalid retention frequency : " + frequency);
}
/**
* Validates feed retention queue and priority with expected values.
* @param feedName : feed name.
* @param expectedPriority : expected priority.
* @param expectedQueue : expected queue.
*/
private void validatePriorityAndQueue(String feedName, String expectedPriority, String expectedQueue)
throws OozieClientException, JMSException, JSONException {
Configuration configuration = OozieUtil.getRetentionConfiguration(clusterOC,
OozieUtil.getBundles(clusterOC, feedName, EntityType.FEED).get(0));
String priority = configuration.get("jobPriority");
String queue = configuration.get("queueName");
Assert.assertEquals(priority, expectedPriority, "Priority should be : " + expectedPriority);
Assert.assertEquals(queue, expectedQueue, "Queue should be : " + expectedQueue);
}
}