package org.apache.hadoop.mapred;
import java.io.IOException;
import java.net.ConnectException;
import java.net.InetSocketAddress;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.mapred.CoronaSessionInfo.InetSocketAddressWritable;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
* Generic caller that retries an RPC call and, on connection error,
* reconnects to the new job tracker. The new job tracker address is
* obtained from the secondary tracker via InterCoronaJobTrackerProtocol.
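* <p>
* A minimal usage sketch (the {@code client} proxy, {@code jobId} and
* {@code createClient} helper below are illustrative, not part of this
* class):
* <pre>
* CoronaJTFallbackCaller&lt;JobStatus&gt; caller =
*     new CoronaJTFallbackCaller&lt;JobStatus&gt;() {
*       protected JobStatus call() throws IOException {
*         return client.getJobStatus(jobId); // hypothetical RPC call
*       }
*       protected void connect(InetSocketAddress address) throws IOException {
*         client = createClient(address);    // hypothetical factory
*       }
*       // ... remaining abstract methods elided ...
*     };
* JobStatus status = caller.makeCall();
* </pre>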
* @param <T> return type of called function
*/
@SuppressWarnings("deprecation")
public abstract class CoronaJTFallbackCaller<T> {
/** Wait time between queries to the secondary tracker while the remote JT
* finishes restarting, in milliseconds */
private static final int SECONDARY_TRACKER_QUERIES_INTERVAL = 10000;
/** Timeout for connecting to the secondary tracker, in milliseconds */
private static final long SECONDARY_TRACKER_CONNECT_TIMEOUT = 30000;
/** Maximum number of times we can ask for the new JT address and be told to
* back off after a single crash */
private static final int SECONDARY_TRACKER_MAX_BACKOFF = 600;
/** Logger */
private static final Log LOG = LogFactory
.getLog(CoronaJTFallbackCaller.class);
/** Maximum number of times the fallback caller may connect to a new address.
* Occasionally, after obtaining the new job tracker address and connecting
* to it, the caller finds that the new job tracker has been lost again, so
* reconnectToNewJobTracker() is called recursively. This constant bounds
* the recursion depth. */
private static final int CONNECT_MAX_NUMBER = 8;
/**
* Perform the call. Must be overridden by a sub-class.
* @return The generic return value.
* @throws IOException
*/
protected abstract T call() throws IOException;
/**
* Predicate determining whether we should retry after learning that the
* remote JT is still in the process of restarting.
* @param retryNum number of retries performed so far (reset after each call
*        failure)
* @return true if we should retry
*/
protected boolean predRetry(int retryNum) {
return retryNum <= SECONDARY_TRACKER_MAX_BACKOFF;
}
/**
* Implements the wait between successive queries to the secondary tracker
* for the new remote JT address.
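* <p>
* Subclasses may override this for a different policy; a sketch of
* exponential backoff (the {@code backoffMsec} field is illustrative):
* <pre>
* private long backoffMsec = 1000;
* protected void waitRetry() throws InterruptedException {
*   synchronized (this) {
*     this.wait(backoffMsec);
*   }
*   backoffMsec = Math.min(backoffMsec * 2, 60000);
* }
* </pre>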
*/
protected void waitRetry() throws InterruptedException {
synchronized (this) {
this.wait(SECONDARY_TRACKER_QUERIES_INTERVAL);
}
}
/**
* Opens an RPC client to the provided address
* @param address address of the new job tracker
* @throws IOException
*/
protected abstract void connect(InetSocketAddress address) throws IOException;
/**
* Closes RPC client
*/
protected abstract void shutdown();
/**
* Gets the current RPC address
* @return current address of the RPC client's destination
*/
protected abstract InetSocketAddress getCurrentClientAddress();
/**
* Returns job configuration
* @return job conf
*/
protected abstract JobConf getConf();
/**
* Gets secondary tracker address
* @return secondary fallback address
*/
protected abstract InetSocketAddress getSecondaryTracker();
/**
* Called when an IOException occurs during the RPC call; subclasses decide
* whether to swallow it (and let the caller retry) or rethrow it
*/
protected abstract void handleIOException(IOException e) throws IOException;
/**
* Template method that makes the call. Throws if it cannot fall back.
* @return The generic return value.
* @throws IOException
*/
public final T makeCall() throws IOException {
while (true) {
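// Loop until the call succeeds; exceptions that cannot be handled
// are rethrown from the catch blocks below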
try {
return call();
} catch (ConnectException e) {
// We fall back only after ConnectException
try {
// Fall back to secondary tracker and reconnect to new JT
reconnectToNewJobTracker(0);
} catch (IOException f) {
LOG.error("Fallback process failed with ", f);
// Re-throw original exception
throw e;
}
} catch (IOException e) {
// The subclass of the fallback caller provides the handling logic
// here; in most cases we will retry
handleIOException(e);
}
}
}
/**
* Reconnects to the new address obtained from the secondary tracker via
* InterCoronaJobTrackerProtocol
* @param connectNum number of connection attempts made so far
* @throws IOException
*/
private void reconnectToNewJobTracker(int connectNum) throws IOException {
if (connectNum >= CONNECT_MAX_NUMBER) {
LOG.error("reconnectToNewJobTracker has reached its maximum number of attempts.");
throw new IOException(
"reconnectToNewJobTracker has reached its maximum number of attempts.");
}
InetSocketAddress secondaryTracker = getSecondaryTracker();
JobConf conf = getConf();
InetSocketAddress oldAddress = getCurrentClientAddress();
LOG.info("Falling back from " + oldAddress + " to secondary tracker at "
+ secondaryTracker + " with " + connectNum + " try");
if (secondaryTracker == null)
throw new IOException("Secondary address not provided.");
shutdown();
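// Open a temporary proxy to the secondary tracker to learn where the
// restarted job tracker lives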
InterCoronaJobTrackerProtocol secondaryClient = RPC.waitForProxy(
InterCoronaJobTrackerProtocol.class,
InterCoronaJobTrackerProtocol.versionID, secondaryTracker, conf,
SECONDARY_TRACKER_CONNECT_TIMEOUT);
// Obtain new address
InetSocketAddressWritable oldAddrWritable = new InetSocketAddressWritable(
oldAddress);
InetSocketAddressWritable newAddress = null;
try {
// Poll the secondary tracker until it reports the new JT address or the
// retry budget is exhausted; wait only when no address was returned
int retryNum = 0;
do {
newAddress = secondaryClient.getNewJobTrackerAddress(oldAddrWritable);
if (newAddress != null) {
break;
}
try {
waitRetry();
} catch (InterruptedException e) {
LOG.error("Fallback wait interrupted, retrying.");
}
++retryNum;
} while (predRetry(retryNum));
} finally {
// Release the proxy on every path so it does not leak
RPC.stopProxy(secondaryClient);
}
if (newAddress == null || newAddress.getAddress() == null) {
throw new IOException("Failed to obtain new job tracker address.");
}
try {
connect(newAddress.getAddress());
LOG.info("Fallback process successful: " + newAddress.getAddress());
} catch (IOException e) {
LOG.error("Fallback connect to " + newAddress.getAddress() + " failed for ", e);
reconnectToNewJobTracker(++connectNum);
}
}
}