/*********************************************************************************************************************** * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu) * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the * specific language governing permissions and limitations under the License. **********************************************************************************************************************/ package eu.stratosphere.test.util; import junit.framework.Assert; import org.apache.log4j.Level; import eu.stratosphere.client.minicluster.NepheleMiniCluster; import eu.stratosphere.nephele.client.JobClient; import eu.stratosphere.nephele.client.JobExecutionException; import eu.stratosphere.nephele.jobgraph.JobGraph; import eu.stratosphere.util.LogUtils; /** * Base class for integration tests which test whether the system recovers from failed executions. */ public abstract class FailingTestBase extends RecordAPITestBase { public FailingTestBase() { LogUtils.initializeDefaultConsoleLogger(Level.OFF); } /** * Returns the {@link JobGraph} of the failing job. * * @return The JobGraph of the failing job. * @throws Exception */ abstract protected JobGraph getFailingJobGraph() throws Exception; /** * Returns the path to the jar-file of the failing job. * * @return Path to the jar-file of the failing job. */ protected String getFailingJarFilePath() { return null; } /** * Returns the timeout for the execution of both (the failing and the working) job in seconds. * * @return Timeout for the execution of both jobs in seconds. */ abstract protected int getTimeout(); /** * Tests that both jobs, the failing and the working one, are handled correctly. * The first (failing) job must be canceled and the Nephele client must report the failure. * The second (working) job must finish successfully and compute the correct result. * A timeout waits for the successful return for the Nephele client. In case of a deadlock * (or too small value for timeout) the time runs out and this test fails. * */ @Override public void testJob() throws Exception { // pre-submit try { preSubmit(); } catch (Exception e) { System.err.println(e.getMessage()); e.printStackTrace(); Assert.fail("Pre-submit work caused an error: " + e.getMessage()); } // init submission thread SubmissionThread st = new SubmissionThread(Thread.currentThread(), this.executor, getFailingJobGraph(), getJobGraph()); // start submission thread st.start(); try { // wait for timeout Thread.sleep(getTimeout()*1000); Assert.fail("Failing job and successful job did not fail."); } catch(InterruptedException ie) { // will have happened if all works fine } Exception cte = st.error; if (cte != null) { cte.printStackTrace(); Assert.fail("Task Canceling failed: " + cte.getMessage()); } // post-submit try { postSubmit(); } catch (Exception e) { System.err.println(e.getMessage()); e.printStackTrace(); Assert.fail("Post-submit work caused an error: " + e.getMessage()); } } /** * Thread for submitting both jobs sequentially to the test cluster. * First, the failing job is submitted. The working job is submitted after the Nephele client returns * from the call of its submitJobAndWait() method. */ private class SubmissionThread extends Thread { // reference to the timeout thread private final Thread timeoutThread; // cluster to submit the job to. private final NepheleMiniCluster executor; // job graph of the failing job (submitted first) private final JobGraph failingJob; // job graph of the working job (submitted after return from failing job) private final JobGraph job; private volatile Exception error; public SubmissionThread(Thread timeoutThread, NepheleMiniCluster executor, JobGraph failingJob, JobGraph job) { this.timeoutThread = timeoutThread; this.executor = executor; this.failingJob = failingJob; this.job = job; } /** * Submits the failing and the working job sequentially to the cluster. * As soon as the second job finishes, the timeout thread is interrupted and this thread closed. */ @Override public void run() { try { // submit failing job JobClient client = this.executor.getJobClient(this.failingJob); client.setConsoleStreamForReporting(AbstractTestBase.getNullPrintStream()); client.submitJobAndWait(); this.error = new Exception("The job did not fail."); } catch(JobExecutionException jee) { // as expected } catch (Exception e) { this.error = e; } try { // submit working job JobClient client = this.executor.getJobClient(this.job); client.setConsoleStreamForReporting(AbstractTestBase.getNullPrintStream()); client.submitJobAndWait(); } catch (Exception e) { this.error = e; } // interrupt timeout thread timeoutThread.interrupt(); } } }