/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.falcon.regression; import org.apache.falcon.entity.v0.EntityType; import org.apache.falcon.entity.v0.Frequency; import org.apache.falcon.entity.v0.process.Input; import org.apache.falcon.entity.v0.process.LateInput; import org.apache.falcon.entity.v0.process.LateProcess; import org.apache.falcon.entity.v0.process.PolicyType; import org.apache.falcon.regression.Entities.ProcessMerlin; import org.apache.falcon.regression.core.bundle.Bundle; import org.apache.falcon.regression.core.helpers.ColoHelper; import org.apache.falcon.regression.core.util.OozieUtil; import org.apache.falcon.regression.core.util.AssertUtil; import org.apache.falcon.regression.core.util.TimeUtil; import org.apache.falcon.regression.core.util.HadoopUtil; import org.apache.falcon.regression.core.util.BundleUtil; import org.apache.falcon.regression.core.util.InstanceUtil; import org.apache.falcon.regression.core.util.OSUtil; import org.apache.falcon.regression.testHelper.BaseTestClass; import org.apache.hadoop.fs.FileSystem; import org.apache.log4j.Logger; import org.apache.oozie.client.CoordinatorAction; import org.apache.oozie.client.Job; import org.apache.oozie.client.OozieClient; import org.testng.Assert; import org.testng.TestNGException; import org.testng.annotations.AfterMethod; import org.testng.annotations.BeforeClass; import org.testng.annotations.BeforeMethod; import org.testng.annotations.Test; import java.util.List; /** * Process late data test. */ @Test(groups = { "distributed", "embedded" }) public class ProcessLateRerunTest extends BaseTestClass { private ColoHelper cluster1 = servers.get(0); private OozieClient cluster1OC = serverOC.get(0); private FileSystem cluster1FS = serverFS.get(0); private String baseTestHDFSDir = cleanAndGetTestDir(); private String aggregateWorkflowDir = baseTestHDFSDir + "/aggregator"; private String feedInputPath = baseTestHDFSDir + "/input" + MINUTE_DATE_PATTERN; private String feedOutputPath = baseTestHDFSDir + "/output-data" + MINUTE_DATE_PATTERN; private static final Logger LOGGER = Logger.getLogger(ProcessLateRerunTest.class); @BeforeClass(alwaysRun = true) public void uploadWorkflow() throws Exception { uploadDirToClusters(aggregateWorkflowDir, OSUtil.RESOURCES_OOZIE); } @BeforeMethod(alwaysRun = true) public void setUp() throws Exception { Bundle bundle = BundleUtil.readLateDataBundle(); bundles[0] = new Bundle(bundle, servers.get(0)); bundles[0].generateUniqueBundle(this); bundles[0].setProcessWorkflow(aggregateWorkflowDir); bundles[0].setInputFeedDataPath(feedInputPath); bundles[0].setOutputFeedLocationData(feedOutputPath); } @AfterMethod(alwaysRun = true) public void tearDown() { removeTestClassEntities(); } /** * Test demonstrates rerunning process for late arrival of data. * Initially there is no input data and empty folders are processed. * It checks the number of rerun attempts once late data has been added * ensuring that late rerun happened. */ @Test(enabled = true) public void testProcessLateRerunOnEmptyFolder() throws Exception { String startTime = TimeUtil.getTimeWrtSystemTime(0); String endTime = TimeUtil.addMinsToTime(startTime, 30); LOGGER.info("Time range between : " + startTime + " and " + endTime); bundles[0].setProcessValidity(startTime, endTime); bundles[0].setProcessPeriodicity(10, Frequency.TimeUnit.minutes); bundles[0].setOutputFeedPeriodicity(10, Frequency.TimeUnit.minutes); bundles[0].setProcessConcurrency(2); String inputName = bundles[0].getProcessObject().getFirstInputName(); bundles[0].setProcessLatePolicy(getLateData(2, "minutes", "periodic", inputName, aggregateWorkflowDir)); bundles[0].submitAndScheduleProcess(); AssertUtil.checkStatus(cluster1OC, EntityType.PROCESS, bundles[0], Job.Status.RUNNING); TimeUtil.sleepSeconds(10); InstanceUtil.waitTillInstancesAreCreated(cluster1OC, bundles[0].getProcessData(), 0); getAndCreateDependencies(cluster1, bundles[0], cluster1OC, cluster1FS, false, 1); int sleepMins = 6; for(int i=0; i < sleepMins; i++) { LOGGER.info("Waiting..."); TimeUtil.sleepSeconds(60); } InstanceUtil.waitTillInstanceReachState(cluster1OC, bundles[0].getProcessName(), 1, CoordinatorAction.Status.SUCCEEDED, EntityType.PROCESS); List<String> bundleList = OozieUtil.getBundles(cluster1.getFeedHelper().getOozieClient(), bundles[0].getProcessName(), EntityType.PROCESS); String bundleID = bundleList.get(0); OozieUtil.validateRetryAttempts(cluster1OC, bundleID, EntityType.PROCESS, 1); } /** * Test demonstrates rerunning process for late arrival of data. * Initially there is some data which is processed. It checks the number of rerun attempts * once further more data has been added ensuring that late rerun happened. */ @Test(enabled = true) public void testProcessLateRerunWithData() throws Exception { String startTime = TimeUtil.getTimeWrtSystemTime(0); String endTime = TimeUtil.addMinsToTime(startTime, 30); LOGGER.info("Time range between : " + startTime + " and " + endTime); bundles[0].setProcessValidity(startTime, endTime); bundles[0].setProcessPeriodicity(5, Frequency.TimeUnit.minutes); bundles[0].setOutputFeedPeriodicity(5, Frequency.TimeUnit.minutes); bundles[0].setProcessConcurrency(2); String inputName = bundles[0].getProcessObject().getFirstInputName(); bundles[0].setProcessLatePolicy(getLateData(4, "minutes", "periodic", inputName, aggregateWorkflowDir)); bundles[0].submitAndScheduleProcess(); AssertUtil.checkStatus(cluster1OC, EntityType.PROCESS, bundles[0], Job.Status.RUNNING); TimeUtil.sleepSeconds(10); InstanceUtil.waitTillInstancesAreCreated(cluster1OC, bundles[0].getProcessData(), 0); getAndCreateDependencies(cluster1, bundles[0], cluster1OC, cluster1FS, true, 1); int sleepMins = 6; for(int i=0; i < sleepMins; i++) { LOGGER.info("Waiting..."); TimeUtil.sleepSeconds(60); } InstanceUtil.waitTillInstanceReachState(cluster1OC, bundles[0].getProcessName(), 1, CoordinatorAction.Status.SUCCEEDED, EntityType.PROCESS); List<String> bundleList = OozieUtil.getBundles(cluster1.getFeedHelper().getOozieClient(), bundles[0].getProcessName(), EntityType.PROCESS); String bundleID = bundleList.get(0); OozieUtil.validateRetryAttempts(cluster1OC, bundleID, EntityType.PROCESS, 1); } /** * Test demonstrates rerunning process for late arrival of data for multiple input folders. * It checks the number of rerun attempts once further more data has been added ensuring that late rerun happened. */ @Test(enabled = true) public void testProcessLateRerunWithMultipleFolders() throws Exception { String startTime = TimeUtil.getTimeWrtSystemTime(0); String endTime = TimeUtil.addMinsToTime(startTime, 30); String startInstance = "now(0,-5)"; String endInstance = "now(0,0)"; LOGGER.info("Time range between : " + startTime + " and " + endTime); bundles[0].setProcessValidity(startTime, endTime); bundles[0].setProcessPeriodicity(10, Frequency.TimeUnit.minutes); bundles[0].setOutputFeedPeriodicity(10, Frequency.TimeUnit.minutes); String inputName = bundles[0].getProcessObject().getFirstInputName(); bundles[0].setProcessLatePolicy(getLateData(4, "minutes", "periodic", inputName, aggregateWorkflowDir)); bundles[0].setProcessConcurrency(2); // Increase the window of input for process bundles[0].setDatasetInstances(startInstance, endInstance); bundles[0].submitAndScheduleProcess(); AssertUtil.checkStatus(cluster1OC, EntityType.PROCESS, bundles[0], Job.Status.RUNNING); TimeUtil.sleepSeconds(10); InstanceUtil.waitTillInstancesAreCreated(cluster1OC, bundles[0].getProcessData(), 0); getAndCreateDependencies(cluster1, bundles[0], cluster1OC, cluster1FS, false, 3); int sleepMins = 6; for(int i=0; i < sleepMins; i++) { LOGGER.info("Waiting..."); TimeUtil.sleepSeconds(60); } InstanceUtil.waitTillInstanceReachState(cluster1OC, bundles[0].getProcessName(), 1, CoordinatorAction.Status.SUCCEEDED, EntityType.PROCESS); List<String> bundleList = OozieUtil.getBundles(cluster1.getFeedHelper().getOozieClient(), bundles[0].getProcessName(), EntityType.PROCESS); String bundleID = bundleList.get(0); OozieUtil.validateRetryAttempts(cluster1OC, bundleID, EntityType.PROCESS, 1); } /** * Test demonstrates rerunning process for late arrival of data for gate folders. * Late rerun will not work on gate folder, so no retry attempt on the appended data. */ @Test(enabled = true) public void testProcessLateRerunWithGate() throws Exception { String startTime = TimeUtil.getTimeWrtSystemTime(0); String endTime = TimeUtil.addMinsToTime(startTime, 30); String startInstance = "now(0,-5)"; String endInstance = "now(0,0)"; LOGGER.info("Time range between : " + startTime + " and " + endTime); bundles[0].setProcessValidity(startTime, endTime); bundles[0].setProcessPeriodicity(10, Frequency.TimeUnit.minutes); bundles[0].setOutputFeedPeriodicity(10, Frequency.TimeUnit.minutes); bundles[0].setProcessConcurrency(2); // Increase the window of input for process bundles[0].setDatasetInstances(startInstance, endInstance); ProcessMerlin process = bundles[0].getProcessObject(); String inputName = process.getFirstInputName(); Input tempFeed = process.getInputs().getInputs().get(0); Input gateInput = new Input(); gateInput.setName("Gate"); gateInput.setFeed(tempFeed.getFeed()); gateInput.setEnd("now(0,1)"); gateInput.setStart("now(0,1)"); process.getInputs().getInputs().add(gateInput); bundles[0].setProcessData(process.toString()); bundles[0].setProcessLatePolicy(getLateData(4, "minutes", "periodic", inputName, aggregateWorkflowDir)); bundles[0].submitAndScheduleProcess(); AssertUtil.checkStatus(cluster1OC, EntityType.PROCESS, bundles[0], Job.Status.RUNNING); TimeUtil.sleepSeconds(10); InstanceUtil.waitTillInstancesAreCreated(cluster1OC, bundles[0].getProcessData(), 0); getAndCreateDependencies(cluster1, bundles[0], cluster1OC, cluster1FS, false, 7); int sleepMins = 6; for(int i=0; i < sleepMins; i++) { LOGGER.info("Waiting..."); TimeUtil.sleepSeconds(60); } InstanceUtil.waitTillInstanceReachState(cluster1OC, bundles[0].getProcessName(), 1, CoordinatorAction.Status.SUCCEEDED, EntityType.PROCESS); List<String> bundleList = OozieUtil.getBundles(cluster1.getFeedHelper().getOozieClient(), bundles[0].getProcessName(), EntityType.PROCESS); String bundleID = bundleList.get(0); OozieUtil.validateRetryAttempts(cluster1OC, bundleID, EntityType.PROCESS, 0); } /* dataFlag - denotes whether process should run initially on empty folders or folders containing data dataFolder - denotes the folder where you want to upload data for late rerun */ private void getAndCreateDependencies(ColoHelper prismHelper, Bundle bundle, OozieClient oozieClient, FileSystem clusterFS, boolean dataFlag, int dataFolder) { try { List<String> bundles = null; for (int i = 0; i < 10; ++i) { bundles = OozieUtil.getBundles(prismHelper.getFeedHelper().getOozieClient(), bundle.getProcessName(), EntityType.PROCESS); if (bundles.size() > 0) { break; } TimeUtil.sleepSeconds(30); } Assert.assertTrue(bundles != null && bundles.size() > 0, "Bundle job not created."); String bundleID = bundles.get(0); LOGGER.info("bundle id: " + bundleID); List<String> missingDependencies = OozieUtil.getMissingDependencies(oozieClient, bundleID); for (int i = 0; i < 10 && missingDependencies == null; ++i) { TimeUtil.sleepSeconds(30); missingDependencies = OozieUtil.getMissingDependencies(oozieClient, bundleID); } Assert.assertNotNull(missingDependencies, "Missing dependencies not found."); //print missing dependencies for (String dependency : missingDependencies) { LOGGER.info("dependency from job: " + dependency); } //create missing dependencies LOGGER.info("Creating missing dependencies..."); OozieUtil.createMissingDependencies(prismHelper, EntityType.PROCESS, bundle.getProcessName(), 0, 0); //Adding data to empty folders depending on dataFlag if (dataFlag) { int tempCount = 1; for (String location : missingDependencies) { if (tempCount==1) { LOGGER.info("Transferring data to : " + location); HadoopUtil.copyDataToFolder(clusterFS, location, OSUtil.concat(OSUtil.NORMAL_INPUT, "dataFile.xml")); tempCount++; } } } //Process succeeding on empty folders LOGGER.info("Waiting for process to succeed..."); InstanceUtil.waitTillInstanceReachState(oozieClient, bundle.getProcessName(), 1, CoordinatorAction.Status.SUCCEEDED, EntityType.PROCESS); TimeUtil.sleepSeconds(30); //Adding data to check late rerun int tempCounter = 1; for (String dependency : missingDependencies) { if (tempCounter==dataFolder) { LOGGER.info("Transferring late data to : " + dependency); HadoopUtil.copyDataToFolder(clusterFS, dependency, OSUtil.concat(OSUtil.NORMAL_INPUT, "dataFile.properties")); } tempCounter++; } } catch (Exception e) { e.printStackTrace(); throw new TestNGException(e); } } private static LateProcess getLateData(int delay, String delayUnits, String retryType, String inputData, String workflowDir) { LateInput lateInput = new LateInput(); lateInput.setInput(inputData); lateInput.setWorkflowPath(workflowDir); LateProcess lateProcess = new LateProcess(); lateProcess.setDelay(new Frequency(delayUnits + "(" + delay + ")")); lateProcess.setPolicy(PolicyType.fromValue(retryType)); lateProcess.getLateInputs().add(lateInput); return lateProcess; } }