/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.qjournal.client;

import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.server.common.HdfsConstants;
import org.apache.hadoop.hdfs.server.common.HdfsConstants.StartupOption;
import org.apache.hadoop.hdfs.server.common.HdfsConstants.Transition;
import org.apache.hadoop.hdfs.qjournal.protocol.JournalConfigKeys;
import org.apache.hadoop.hdfs.qjournal.protocol.JournalOutOfSyncException;
import org.apache.hadoop.hdfs.qjournal.protocol.JournalRequestInfo;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocol;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.GetEditLogManifestResponseProto;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.GetImageManifestResponseProto;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.GetJournalStateResponseProto;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.GetStorageStateProto;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.NewEpochResponseProto;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.PrepareRecoveryResponseProto;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.SegmentStateProto;
import org.apache.hadoop.hdfs.qjournal.protocol.RequestInfo;
import org.apache.hadoop.hdfs.qjournal.server.GetJournalEditServlet;
import org.apache.hadoop.hdfs.qjournal.server.GetJournalImageServlet;
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest;
import org.apache.hadoop.hdfs.server.protocol.RemoteImageManifest;
import org.apache.hadoop.io.MD5Hash;
import org.apache.hadoop.ipc.RPC;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.base.Stopwatch;
import com.google.common.net.InetAddresses;
import com.google.common.util.concurrent.FutureCallback;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
import com.google.common.util.concurrent.ListeningExecutorService;
import com.google.common.util.concurrent.MoreExecutors;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import com.google.common.util.concurrent.UncaughtExceptionHandlers;
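// Illustrative sketch only (not part of the original source): callers such as
// QuorumJournalManager fan one of the calls below out over every logger and
// then wait for a quorum of the returned futures via QuorumCall. Roughly as
// follows -- the exact QuorumCall API and the timeout values are assumptions:
//
//   Map<AsyncLogger, ListenableFuture<Void>> calls = Maps.newHashMap();
//   for (AsyncLogger logger : loggers) {
//     calls.put(logger,
//         logger.sendEdits(segmentTxId, firstTxnId, numTxns, data));
//   }
//   QuorumCall<AsyncLogger, Void> q = QuorumCall.create(calls);
//   q.waitFor(loggers.size(), majoritySize, 0, timeoutMs, "sendEdits");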
/**
 * Channel to a remote JournalNode using Hadoop IPC.
 * All of the calls are run on a separate thread, and return
 * {@link ListenableFuture} instances to wait for their result.
 * This allows calls to be bound together using the {@link QuorumCall}
 * class.
 */
@InterfaceAudience.Private
public class IPCLoggerChannel implements AsyncLogger {

  public static final Log LOG = LogFactory.getLog(IPCLoggerChannel.class);

  private final Configuration conf;
  protected final InetSocketAddress addr;
  private QJournalProtocol proxy;

  private final ListeningExecutorService executor;
  private long ipcSerial = 0;
  private long epoch = -1;
  private long committedTxId = HdfsConstants.INVALID_TXID;

  private final String journalId;
  private final byte[] journalIdBytes;
  private final NamespaceInfo nsInfo;
  private int httpPort = -1;

  private final IPCLoggerChannelMetrics metrics;

  /**
   * The number of bytes of edits data still in the queue.
   */
  private int queuedEditsSizeBytes = 0;

  /**
   * The highest txid that has been successfully logged on the remote JN.
   */
  private long highestAckedTxId = 0;

  /**
   * Nanotime of the last time we successfully journaled some edits
   * to the remote node.
   */
  private long lastAckNanos = 0;

  /**
   * Nanotime of the last time that committedTxId was updated. Used
   * to calculate the lag in terms of time, rather than just a number
   * of txns.
   */
  private long lastCommitNanos = 0;

  /**
   * The maximum number of bytes that can be pending in the queue.
   * This keeps the writer from hitting OOME if one of the loggers
   * starts responding really slowly. Eventually, the queue
   * overflows and it starts to treat the logger as having errored.
   */
  private final int queueSizeLimitBytes;
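  // Worked example (a sketch; the 10 MB figure is an assumption for
  // illustration, the actual default is configuration-specific): the
  // configured limit is in MB, since the constructor multiplies it by
  // 1024 * 1024. With a 10 MB limit, a 300 KB batch is rejected once the
  // queue already holds ~10.2 MB:
  //   limit  = 10 * 1024 * 1024 = 10,485,760 bytes
  //   queued = 10,200,000; batch = 300,000
  //   queued + batch = 10,500,000 > limit  -> LoggerTooFarBehindException
  // Note the "queuedEditsSizeBytes > 0" guard in reserveQueueSpace(): a single
  // batch larger than the whole limit is still accepted when the queue is
  // empty, so one oversized batch cannot permanently wedge the writer.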
  /**
   * If this logger misses some edits, or restarts in the middle of
   * a segment, the writer won't be able to write any more edits until
   * the beginning of the next segment. Upon detecting this situation,
   * the writer sets this flag to true to avoid sending useless RPCs.
   */
  private boolean outOfSync = false;

  /**
   * Stopwatch which starts counting on each heartbeat that is sent.
   */
  private Stopwatch lastHeartbeatStopwatch = new Stopwatch();

  private static final long HEARTBEAT_INTERVAL_MILLIS = 1000;

  private static final long WARN_JOURNAL_MILLIS_THRESHOLD = 1000;

  static final Factory FACTORY = new AsyncLogger.Factory() {
    @Override
    public AsyncLogger createLogger(Configuration conf, NamespaceInfo nsInfo,
        String journalId, InetSocketAddress addr) {
      return new IPCLoggerChannel(conf, nsInfo, journalId, addr);
    }
  };

  public IPCLoggerChannel(Configuration conf,
      NamespaceInfo nsInfo,
      String journalId,
      InetSocketAddress addr) {
    this.conf = conf;
    this.nsInfo = nsInfo;
    this.journalId = journalId;
    this.journalIdBytes = QuorumJournalManager
        .journalIdStringToBytes(journalId);
    this.addr = addr;

    this.queueSizeLimitBytes = 1024 * 1024 * conf.getInt(
        JournalConfigKeys.DFS_QJOURNAL_QUEUE_SIZE_LIMIT_KEY,
        JournalConfigKeys.DFS_QJOURNAL_QUEUE_SIZE_LIMIT_DEFAULT);

    executor = MoreExecutors.listeningDecorator(
        createExecutor());
    metrics = IPCLoggerChannelMetrics.create(this);
  }

  @Override
  public synchronized void setEpoch(long epoch) {
    this.epoch = epoch;
  }

  @Override
  public synchronized void setCommittedTxId(long txid, boolean force) {
    if (!force && txid < committedTxId) {
      throw new IllegalArgumentException(
          "Trying to move committed txid backwards in client, old: "
              + committedTxId + " new: " + txid);
    }
    this.committedTxId = txid;
    if (force && this.highestAckedTxId > txid) {
      this.highestAckedTxId = txid;
    }
    if (!force) {
      this.lastCommitNanos = System.nanoTime();
      metrics.setCurrentLagTransactions(getLagTxns());
      metrics.setCurrentLagTimeMicros(getLagTimeMicros());
    }
  }

  @Override
  public void close() {
    // No more tasks may be submitted after this point.
    executor.shutdown();
    if (proxy != null) {
      // TODO: this can hang for quite some time if the client
      // is currently in the middle of a call to a downed JN.
      // We should instead do this asynchronously, and just stop
      // making any more calls after this point (e.g., clear the queue).
      try {
        RPC.stopProxy(proxy);
      } catch (Exception e) {
        LOG.error("error: ", e);
      }
    }
  }

  protected QJournalProtocol getProxy() throws IOException {
    if (proxy != null) return proxy;
    proxy = createProxy();
    return proxy;
  }

  protected QJournalProtocol createProxy() throws IOException {
    final Configuration confCopy = new Configuration(conf);

    // Need to set NODELAY or else batches larger than MTU can trigger
    // 40ms nagling delays.
    confCopy.setBoolean("ipc.client.tcpnodelay", true);
    long connectionTimeout = conf.getLong(
        JournalConfigKeys.DFS_QJOURNAL_CONNECT_TIMEOUT_KEY, Long.MAX_VALUE);
    return (QJournalProtocol) RPC.waitForProxy(QJournalProtocol.class,
        QJournalProtocol.versionID, addr, confCopy, connectionTimeout, 0);
  }
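  // Hypothetical test usage (a sketch, not code from this file): because
  // createExecutor() below is a protected seam, a unit test can force the
  // channel to run its calls inline and stay deterministic.
  // MoreExecutors.sameThreadExecutor() is the pre-Guava-18 name for a direct
  // executor; newer Guava spells it newDirectExecutorService().
  //
  //   IPCLoggerChannel ch = new IPCLoggerChannel(conf, nsInfo, jid, addr) {
  //     @Override
  //     protected ExecutorService createExecutor() {
  //       return MoreExecutors.sameThreadExecutor();
  //     }
  //   };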
  /**
   * Separated out for easy overriding in tests.
   */
  @VisibleForTesting
  protected ExecutorService createExecutor() {
    return Executors.newSingleThreadExecutor(
        new ThreadFactoryBuilder()
            .setDaemon(true)
            .setNameFormat("Logger channel to " + addr)
            .setUncaughtExceptionHandler(
                UncaughtExceptionHandlers.systemExit())
            .build());
  }

  @Override
  public URL buildURLToFetchLogs(long segmentTxId, long position) {
    Preconditions.checkArgument(segmentTxId > -1,
        "Invalid segment: %s", segmentTxId);
    Preconditions.checkState(httpPort != -1,
        "HTTP port not set yet");

    try {
      String path = GetJournalEditServlet.buildPath(
          journalId, segmentTxId, nsInfo, position);
      return new URL("http", addr.getAddress().getHostAddress(), httpPort,
          path.toString());
    } catch (MalformedURLException e) {
      // should never get here.
      throw new IllegalStateException(e);
    }
  }

  private synchronized RequestInfo createReqInfo() {
    checkEpoch();
    return new RequestInfo(journalIdBytes, epoch, ipcSerial++, committedTxId);
  }

  private synchronized JournalRequestInfo createJournalReqInfo(
      long segmentTxId, long firstTxnId, int numTxns, byte[] records) {
    checkEpoch();
    return new JournalRequestInfo(journalIdBytes, epoch, ipcSerial++,
        committedTxId, segmentTxId, firstTxnId, numTxns, records);
  }

  private void checkEpoch() {
    if (epoch <= 0) {
      throw new IllegalStateException("bad epoch: " + epoch);
    }
  }

  @VisibleForTesting
  synchronized long getNextIpcSerial() {
    return ipcSerial;
  }

  public synchronized int getQueuedEditsSize() {
    return queuedEditsSizeBytes;
  }

  public InetSocketAddress getRemoteAddress() {
    return addr;
  }

  /**
   * @return true if the server has gotten out of sync from the client,
   * and thus a log roll is required for this logger to successfully start
   * logging more edits.
   */
  public synchronized boolean isOutOfSync() {
    return outOfSync;
  }

  @VisibleForTesting
  void waitForAllPendingCalls() throws InterruptedException {
    try {
      executor.submit(new Runnable() {
        @Override
        public void run() {
        }
      }).get();
    } catch (ExecutionException e) {
      // This can't happen!
      throw new IllegalStateException(e);
    }
  }

  @Override
  public ListenableFuture<Boolean> isJournalFormatted() {
    return executor.submit(new Callable<Boolean>() {
      @Override
      public Boolean call() throws IOException {
        return getProxy().isJournalFormatted(journalIdBytes);
      }
    });
  }

  @Override
  public ListenableFuture<GetJournalStateResponseProto> getJournalState() {
    return executor.submit(new Callable<GetJournalStateResponseProto>() {
      @Override
      public GetJournalStateResponseProto call() throws IOException {
        GetJournalStateResponseProto ret =
            getProxy().getJournalState(journalIdBytes);
        httpPort = ret.getHttpPort();
        return ret;
      }
    });
  }

  @Override
  public ListenableFuture<NewEpochResponseProto> newEpoch(
      final long epoch) {
    return executor.submit(new Callable<NewEpochResponseProto>() {
      @Override
      public NewEpochResponseProto call() throws IOException {
        return getProxy().newEpoch(journalIdBytes, nsInfo, epoch);
      }
    });
  }
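  // Lifecycle sketch for sendEdits() below (descriptive comment added for
  // clarity): queue space is reserved synchronously on the caller's thread,
  // the RPC itself runs on the single channel thread, and the reservation is
  // released exactly once when the future completes, whether it succeeded or
  // failed:
  //   reserveQueueSpace(data.length);   // may throw LoggerTooFarBehindException
  //   future = executor.submit(rpc);    // journal() RPC on the channel thread
  //   Futures.addCallback(future, ...); // unreserveQueueSpace on either outcome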
  @Override
  public ListenableFuture<Void> sendEdits(
      final long segmentTxId, final long firstTxnId,
      final int numTxns, final byte[] data) {
    try {
      reserveQueueSpace(data.length);
    } catch (LoggerTooFarBehindException e) {
      // Log the exception, because immediateFailedFuture will log nothing here.
      LOG.warn("Edits queue overflowed when trying to write txns "
          + firstTxnId + "-" + (firstTxnId + numTxns - 1), e);
      return Futures.immediateFailedFuture(e);
    }

    // When this batch is acked, we use its submission time in order
    // to calculate how far we are lagging.
    final long submitNanos = System.nanoTime();

    ListenableFuture<Void> ret = null;
    try {
      ret = executor.submit(new Callable<Void>() {
        @Override
        public Void call() throws IOException {
          throwIfOutOfSync();
          boolean isDebugEnabled = LOG.isDebugEnabled();
          long rpcSendTimeNanos = System.nanoTime();
          try {
            if (isDebugEnabled) {
              LOG.debug("Preparing to write txns " + firstTxnId + "-"
                  + (firstTxnId + numTxns - 1) + " to remote journal "
                  + IPCLoggerChannel.this);
            }
            getProxy().journal(createJournalReqInfo(
                segmentTxId, firstTxnId, numTxns, data));
            if (isDebugEnabled) {
              LOG.debug("Finished writing txns " + firstTxnId + "-"
                  + (firstTxnId + numTxns - 1) + " to remote journal "
                  + IPCLoggerChannel.this + ". Elapsed time (micros): "
                  + ((System.nanoTime() - rpcSendTimeNanos) / 1000));
            }
          } catch (IOException e) {
            QuorumJournalManager.LOG.warn(
                "Remote journal " + IPCLoggerChannel.this + " failed to "
                    + "write txns " + firstTxnId + "-"
                    + (firstTxnId + numTxns - 1)
                    + ". Will try to write to this JN again after the next "
                    + "log roll.", e);
            synchronized (IPCLoggerChannel.this) {
              outOfSync = true;
              metrics.setOutOfSync(outOfSync);
            }
            throw e;
          } finally {
            long rpcTime = DFSUtil.getElapsedTimeMicroSeconds(rpcSendTimeNanos);
            long endToEndTime = DFSUtil.getElapsedTimeMicroSeconds(submitNanos);
            metrics.addWriteEndToEndLatency(endToEndTime);
            metrics.addWriteRpcLatency(rpcTime);
            if (rpcTime / 1000 > WARN_JOURNAL_MILLIS_THRESHOLD) {
              QuorumJournalManager.LOG.warn(
                  "Took " + (rpcTime / 1000) + "ms to send a batch of "
                      + numTxns + " edits (firstTxnId " + firstTxnId + ", "
                      + data.length + " bytes) to "
                      + "remote journal " + IPCLoggerChannel.this);
            }
          }
          synchronized (IPCLoggerChannel.this) {
            highestAckedTxId = firstTxnId + numTxns - 1;
            lastAckNanos = submitNanos;
            metrics.setCurrentLagTransactions(getLagTxns());
            metrics.setCurrentLagTimeMicros(getLagTimeMicros());
          }
          return null;
        }
      });
    } finally {
      if (ret == null) {
        // It didn't successfully get submitted,
        // so adjust the queue size back down.
        unreserveQueueSpace(data.length);
      } else {
        // It was submitted to the queue, so adjust the length
        // once the call completes, regardless of whether it
        // succeeds or fails.
        Futures.addCallback(ret, new FutureCallback<Void>() {
          @Override
          public void onFailure(Throwable t) {
            unreserveQueueSpace(data.length);
          }

          @Override
          public void onSuccess(Void t) {
            unreserveQueueSpace(data.length);
          }
        });
      }
    }
    return ret;
  }

  private void throwIfOutOfSync()
      throws JournalOutOfSyncException, IOException {
    if (isOutOfSync()) {
      // Even if we're out of sync, it's useful to send an RPC
      // to the remote node in order to update its lag metrics, etc.
      heartbeatIfNecessary();
      throw new JournalOutOfSyncException(
          "Journal disabled until next roll");
    }
    metrics.setOutOfSync(outOfSync);
  }

  /**
   * When we've entered an out-of-sync state, it's still useful to periodically
   * send an empty RPC to the server, such that it has the up-to-date
   * committedTxId. This acts as a sanity check during recovery, and also allows
   * that node's metrics to be up-to-date about its lag.
   *
   * In the future, this method may also be used in order to check that the
   * current node is still the current writer, even if no edits are being
   * written.
   */
  private void heartbeatIfNecessary() throws IOException {
    if (lastHeartbeatStopwatch.elapsedMillis() > HEARTBEAT_INTERVAL_MILLIS
        || !lastHeartbeatStopwatch.isRunning()) {
      try {
        getProxy().heartbeat(createReqInfo());
      } finally {
        // Don't send heartbeats more often than the configured interval,
        // even if they fail.
        lastHeartbeatStopwatch.reset().start();
      }
    }
  }
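  // Timing note (descriptive comment added for clarity): the stopwatch above
  // is restarted after every attempt, even a failed one, so an unreachable JN
  // is pinged at most once per HEARTBEAT_INTERVAL_MILLIS (1s) rather than once
  // per suppressed edit batch.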
  private synchronized void reserveQueueSpace(int size)
      throws LoggerTooFarBehindException {
    Preconditions.checkArgument(size >= 0);
    if (queuedEditsSizeBytes + size > queueSizeLimitBytes
        && queuedEditsSizeBytes > 0) {
      throw new LoggerTooFarBehindException("The pending edits queue to "
          + IPCLoggerChannel.this + " would overflow. Required size: "
          + (queuedEditsSizeBytes + size) + ", limit: "
          + queueSizeLimitBytes);
    }
    queuedEditsSizeBytes += size;
    metrics.setCurrentQueuedEditsSizeBytes(queuedEditsSizeBytes);
  }

  private synchronized void unreserveQueueSpace(int size) {
    Preconditions.checkArgument(size >= 0);
    queuedEditsSizeBytes -= size;
    metrics.setCurrentQueuedEditsSizeBytes(queuedEditsSizeBytes);
  }

  @Override
  public ListenableFuture<Void> transitionImage(final NamespaceInfo nsInfo,
      final Transition transition, final StartupOption startOpt) {
    return executor.submit(new Callable<Void>() {
      @Override
      public Void call() throws Exception {
        getProxy().transitionImage(journalIdBytes, nsInfo, transition,
            startOpt);
        return null;
      }
    });
  }

  @Override
  public ListenableFuture<Void> transitionJournal(final NamespaceInfo nsInfo,
      final Transition transition, final StartupOption startOpt) {
    return executor.submit(new Callable<Void>() {
      @Override
      public Void call() throws Exception {
        getProxy().transitionJournal(journalIdBytes, nsInfo, transition,
            startOpt);
        return null;
      }
    });
  }

  @Override
  public ListenableFuture<Void> startLogSegment(final long txid) {
    return executor.submit(new Callable<Void>() {
      @Override
      public Void call() throws IOException {
        getProxy().startLogSegment(createReqInfo(), txid);
        synchronized (IPCLoggerChannel.this) {
          if (outOfSync) {
            outOfSync = false;
            QuorumJournalManager.LOG.info(
                "Restarting previously-stopped writes to "
                    + IPCLoggerChannel.this + " in segment starting at txid "
                    + txid);
          }
          metrics.setOutOfSync(outOfSync);
        }
        return null;
      }
    });
  }

  @Override
  public ListenableFuture<Void> finalizeLogSegment(
      final long startTxId, final long endTxId) {
    return executor.submit(new Callable<Void>() {
      @Override
      public Void call() throws IOException {
        throwIfOutOfSync();
        getProxy().finalizeLogSegment(createReqInfo(), startTxId, endTxId);
        return null;
      }
    });
  }

  @Override
  public ListenableFuture<Void> purgeLogsOlderThan(final long minTxIdToKeep) {
    return executor.submit(new Callable<Void>() {
      @Override
      public Void call() throws Exception {
        getProxy().purgeLogsOlderThan(createReqInfo(), minTxIdToKeep);
        return null;
      }
    });
  }

  @Override
  public ListenableFuture<RemoteEditLogManifest> getEditLogManifest(
      final long fromTxnId) {
    return executor.submit(new Callable<RemoteEditLogManifest>() {
      @Override
      public RemoteEditLogManifest call() throws IOException {
        GetEditLogManifestResponseProto ret = getProxy().getEditLogManifest(
            journalIdBytes, fromTxnId);
        // Update the http port, since we need this to build URLs to any of the
        // returned logs.
        httpPort = ret.getHttpPort();
        return new RemoteEditLogManifest(ret.getLogs());
      }
    });
  }
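  // Recovery note (descriptive comment added for clarity): prepareRecovery()
  // and acceptRecovery() below form the two phases of QJM's Paxos-style
  // segment recovery. The writer first asks each JN what it has for the
  // segment (prepare), then tells out-of-date JNs to fetch the chosen copy
  // (accept) from a URL built by buildURLToFetchLogs(), which is why the HTTP
  // port must be known before recovery starts.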
  @Override
  public ListenableFuture<PrepareRecoveryResponseProto> prepareRecovery(
      final long segmentTxId) {
    return executor.submit(new Callable<PrepareRecoveryResponseProto>() {
      @Override
      public PrepareRecoveryResponseProto call() throws IOException {
        if (httpPort < 0) {
          // If the HTTP port hasn't been set yet, force an RPC call so we know
          // what the HTTP port should be.
          httpPort = getProxy().getJournalState(journalIdBytes).getHttpPort();
        }
        return getProxy().prepareRecovery(createReqInfo(), segmentTxId);
      }
    });
  }

  @Override
  public ListenableFuture<Void> acceptRecovery(
      final SegmentStateProto log, final String url) {
    return executor.submit(new Callable<Void>() {
      @Override
      public Void call() throws IOException {
        getProxy().acceptRecovery(createReqInfo(), log, url);
        return null;
      }
    });
  }

  @Override
  public String toString() {
    return InetAddresses.toAddrString(addr.getAddress()) + ':'
        + addr.getPort();
  }

  @Override
  public String toHTMLString() {
    // (F) logger is disabled
    // (A) logger is active
    return InetAddresses.toAddrString(addr.getAddress()) + ':'
        + addr.getPort()
        + (isOutOfSync() ? "<font color=red><b>(F)</b></font>"
            : "<font color=green><b>(A)</b></font>");
  }

  @Override
  public synchronized void appendHtmlReport(StringBuilder sb) {
    sb.append("Written txid ").append(highestAckedTxId);
    long behind = getLagTxns();
    if (behind > 0) {
      if (lastAckNanos != 0) {
        long lagMicros = getLagTimeMicros();
        sb.append(" (" + behind + " txns/" + lagMicros + " micros behind)");
      } else {
        sb.append(" (never written)");
      }
    }
    if (outOfSync) {
      sb.append(" (will re-sync on next segment)");
    }
  }

  private synchronized long getLagTxns() {
    return Math.max(committedTxId - highestAckedTxId, 0);
  }

  private synchronized long getLagTimeMicros() {
    return Math.max((lastCommitNanos - lastAckNanos) / 1000, 0);
  }

  @Override
  public ListenableFuture<GetStorageStateProto> analyzeJournalStorage() {
    return executor.submit(new Callable<GetStorageStateProto>() {
      @Override
      public GetStorageStateProto call() throws Exception {
        return getProxy().analyzeJournalStorage(journalIdBytes);
      }
    });
  }

  @Override
  public ListenableFuture<Void> saveDigestAndRenameCheckpointImage(
      final long txid, final MD5Hash digest) {
    return executor.submit(new Callable<Void>() {
      @Override
      public Void call() throws IOException {
        getProxy().saveDigestAndRenameCheckpointImage(createReqInfo(), txid,
            digest);
        return null;
      }
    });
  }

  @Override
  public ListenableFuture<RemoteImageManifest> getImageManifest(
      final long fromTxnId) {
    return executor.submit(new Callable<RemoteImageManifest>() {
      @Override
      public RemoteImageManifest call() throws IOException {
        GetImageManifestResponseProto ret = getProxy().getImageManifest(
            journalIdBytes, fromTxnId);
        return new RemoteImageManifest(ret.getImages());
      }
    });
  }

  /**
   * Build the URL to fetch an image from the journal node to which this
   * logger channel is attached.
   */
  @Override
  public URL buildURLToFetchImage(long txid) {
    Preconditions.checkArgument(txid >= -1, "Invalid segment: %s", txid);
    Preconditions.checkState(httpPort != -1, "HTTP port not set yet");

    try {
      // For now we disable throttling for image downloads.
      String path = GetJournalImageServlet.buildPath(journalId, txid, nsInfo,
          true);
      return new URL("http", addr.getAddress().getHostAddress(), httpPort,
          path.toString());
    } catch (MalformedURLException e) {
      // should never get here.
      throw new IllegalStateException(e);
    }
  }

  @Override
  public ListenableFuture<GetStorageStateProto> analyzeImageStorage() {
    return executor.submit(new Callable<GetStorageStateProto>() {
      @Override
      public GetStorageStateProto call() throws Exception {
        return getProxy().analyzeImageStorage(journalIdBytes);
      }
    });
  }

  @Override
  public ListenableFuture<Boolean> isImageFormatted() {
    return executor.submit(new Callable<Boolean>() {
      @Override
      public Boolean call() throws IOException {
        return getProxy().isImageFormatted(journalIdBytes);
      }
    });
  }
}