/* * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.replication.regionserver; import com.google.common.cache.CacheBuilder; import com.google.common.cache.CacheLoader; import com.google.common.cache.LoadingCache; import com.google.common.collect.Lists; import com.google.common.util.concurrent.ListenableFuture; import com.google.common.util.concurrent.Service; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.Comparator; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.UUID; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.PriorityBlockingQueue; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicLong; import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.Cell; import org.apache.hadoop.hbase.CellUtil; import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.MetaTableAccessor; import org.apache.hadoop.hbase.Stoppable; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.classification.InterfaceAudience; import org.apache.hadoop.hbase.regionserver.RSRpcServices; import org.apache.hadoop.hbase.regionserver.wal.WALEdit; import org.apache.hadoop.hbase.replication.ChainWALEntryFilter; import org.apache.hadoop.hbase.replication.ClusterMarkingEntryFilter; import org.apache.hadoop.hbase.replication.ReplicationEndpoint; import org.apache.hadoop.hbase.replication.ReplicationException; import org.apache.hadoop.hbase.replication.ReplicationPeer; import org.apache.hadoop.hbase.replication.ReplicationPeers; import org.apache.hadoop.hbase.replication.ReplicationQueueInfo; import org.apache.hadoop.hbase.replication.ReplicationQueues; import org.apache.hadoop.hbase.replication.SystemTableWALEntryFilter; import org.apache.hadoop.hbase.replication.WALEntryFilter; import org.apache.hadoop.hbase.replication.regionserver.ReplicationSourceWALReaderThread.WALEntryBatch; import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.BulkLoadDescriptor; import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.StoreDescriptor; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; import org.apache.hadoop.hbase.util.FSUtils; import org.apache.hadoop.hbase.util.Pair; import org.apache.hadoop.hbase.util.Threads; import org.apache.hadoop.hbase.wal.AbstractFSWALProvider; import org.apache.hadoop.hbase.wal.WAL.Entry; /** * Class that handles the source of a replication stream. * Currently does not handle more than 1 slave * For each slave cluster it selects a random number of peers * using a replication ratio. For example, if replication ration = 0.1 * and slave cluster has 100 region servers, 10 will be selected. * <p> * A stream is considered down when we cannot contact a region server on the * peer cluster for more than 55 seconds by default. * </p> * */ @InterfaceAudience.Private public class ReplicationSource extends Thread implements ReplicationSourceInterface { private static final Log LOG = LogFactory.getLog(ReplicationSource.class); // Queues of logs to process, entry in format of walGroupId->queue, // each presents a queue for one wal group private Map<String, PriorityBlockingQueue<Path>> queues = new HashMap<>(); // per group queue size, keep no more than this number of logs in each wal group private int queueSizePerGroup; private ReplicationQueues replicationQueues; private ReplicationPeers replicationPeers; private Configuration conf; private ReplicationQueueInfo replicationQueueInfo; // id of the peer cluster this source replicates to private String peerId; String actualPeerId; // The manager of all sources to which we ping back our progress private ReplicationSourceManager manager; // Should we stop everything? private Stoppable stopper; // How long should we sleep for each retry private long sleepForRetries; private FileSystem fs; // id of this cluster private UUID clusterId; // id of the other cluster private UUID peerClusterId; // total number of edits we replicated private AtomicLong totalReplicatedEdits = new AtomicLong(0); // total number of edits we replicated private AtomicLong totalReplicatedOperations = new AtomicLong(0); // The znode we currently play with private String peerClusterZnode; // Maximum number of retries before taking bold actions private int maxRetriesMultiplier; // Indicates if this particular source is running private volatile boolean sourceRunning = false; // Metrics for this source private MetricsSource metrics; //WARN threshold for the number of queued logs, defaults to 2 private int logQueueWarnThreshold; // ReplicationEndpoint which will handle the actual replication private ReplicationEndpoint replicationEndpoint; // A filter (or a chain of filters) for the WAL entries. private WALEntryFilter walEntryFilter; // throttler private ReplicationThrottler throttler; private long defaultBandwidth; private long currentBandwidth; private ConcurrentHashMap<String, ReplicationSourceShipperThread> workerThreads = new ConcurrentHashMap<>(); private AtomicLong totalBufferUsed; /** * Instantiation method used by region servers * * @param conf configuration to use * @param fs file system to use * @param manager replication manager to ping to * @param stopper the atomic boolean to use to stop the regionserver * @param peerClusterZnode the name of our znode * @param clusterId unique UUID for the cluster * @param replicationEndpoint the replication endpoint implementation * @param metrics metrics for replication source * @throws IOException */ @Override public void init(final Configuration conf, final FileSystem fs, final ReplicationSourceManager manager, final ReplicationQueues replicationQueues, final ReplicationPeers replicationPeers, final Stoppable stopper, final String peerClusterZnode, final UUID clusterId, ReplicationEndpoint replicationEndpoint, final MetricsSource metrics) throws IOException { this.stopper = stopper; this.conf = HBaseConfiguration.create(conf); decorateConf(); this.sleepForRetries = this.conf.getLong("replication.source.sleepforretries", 1000); // 1 second this.maxRetriesMultiplier = this.conf.getInt("replication.source.maxretriesmultiplier", 300); // 5 minutes @ 1 sec per this.queueSizePerGroup = this.conf.getInt("hbase.regionserver.maxlogs", 32); this.replicationQueues = replicationQueues; this.replicationPeers = replicationPeers; this.manager = manager; this.fs = fs; this.metrics = metrics; this.clusterId = clusterId; this.peerClusterZnode = peerClusterZnode; this.replicationQueueInfo = new ReplicationQueueInfo(peerClusterZnode); // ReplicationQueueInfo parses the peerId out of the znode for us this.peerId = this.replicationQueueInfo.getPeerId(); ReplicationQueueInfo replicationQueueInfo = new ReplicationQueueInfo(peerId); this.actualPeerId = replicationQueueInfo.getPeerId(); this.logQueueWarnThreshold = this.conf.getInt("replication.source.log.queue.warn", 2); this.replicationEndpoint = replicationEndpoint; defaultBandwidth = this.conf.getLong("replication.source.per.peer.node.bandwidth", 0); currentBandwidth = getCurrentBandwidth(); this.throttler = new ReplicationThrottler((double) currentBandwidth / 10.0); this.totalBufferUsed = manager.getTotalBufferUsed(); LOG.info("peerClusterZnode=" + peerClusterZnode + ", ReplicationSource : " + peerId + ", currentBandwidth=" + this.currentBandwidth); } private void decorateConf() { String replicationCodec = this.conf.get(HConstants.REPLICATION_CODEC_CONF_KEY); if (StringUtils.isNotEmpty(replicationCodec)) { this.conf.set(HConstants.RPC_CODEC_CONF_KEY, replicationCodec); } } @Override public void enqueueLog(Path log) { String logPrefix = AbstractFSWALProvider.getWALPrefixFromWALName(log.getName()); PriorityBlockingQueue<Path> queue = queues.get(logPrefix); if (queue == null) { queue = new PriorityBlockingQueue<>(queueSizePerGroup, new LogsComparator()); queues.put(logPrefix, queue); if (this.sourceRunning) { // new wal group observed after source startup, start a new worker thread to track it // notice: it's possible that log enqueued when this.running is set but worker thread // still not launched, so it's necessary to check workerThreads before start the worker final ReplicationSourceShipperThread worker = new ReplicationSourceShipperThread(logPrefix, queue, replicationQueueInfo, this); ReplicationSourceShipperThread extant = workerThreads.putIfAbsent(logPrefix, worker); if (extant != null) { LOG.debug("Someone has beat us to start a worker thread for wal group " + logPrefix); } else { LOG.debug("Starting up worker for wal group " + logPrefix); worker.startup(); } } } queue.put(log); this.metrics.incrSizeOfLogQueue(); // This will log a warning for each new log that gets created above the warn threshold int queueSize = queue.size(); if (queueSize > this.logQueueWarnThreshold) { LOG.warn("WAL group " + logPrefix + " queue size: " + queueSize + " exceeds value of replication.source.log.queue.warn: " + logQueueWarnThreshold); } } @Override public void addHFileRefs(TableName tableName, byte[] family, List<Pair<Path, Path>> pairs) throws ReplicationException { String peerId = peerClusterZnode; if (peerId.contains("-")) { // peerClusterZnode will be in the form peerId + "-" + rsZNode. // A peerId will not have "-" in its name, see HBASE-11394 peerId = peerClusterZnode.split("-")[0]; } Map<TableName, List<String>> tableCFMap = replicationPeers.getConnectedPeer(peerId).getTableCFs(); if (tableCFMap != null) { List<String> tableCfs = tableCFMap.get(tableName); if (tableCFMap.containsKey(tableName) && (tableCfs == null || tableCfs.contains(Bytes.toString(family)))) { this.replicationQueues.addHFileRefs(peerId, pairs); metrics.incrSizeOfHFileRefsQueue(pairs.size()); } else { LOG.debug("HFiles will not be replicated belonging to the table " + tableName + " family " + Bytes.toString(family) + " to peer id " + peerId); } } else { // user has explicitly not defined any table cfs for replication, means replicate all the // data this.replicationQueues.addHFileRefs(peerId, pairs); metrics.incrSizeOfHFileRefsQueue(pairs.size()); } } private void uninitialize() { LOG.debug("Source exiting " + this.peerId); metrics.clear(); if (replicationEndpoint.state() == Service.State.STARTING || replicationEndpoint.state() == Service.State.RUNNING) { replicationEndpoint.stopAndWait(); } } @Override public void run() { // mark we are running now this.sourceRunning = true; try { // start the endpoint, connect to the cluster Service.State state = replicationEndpoint.start().get(); if (state != Service.State.RUNNING) { LOG.warn("ReplicationEndpoint was not started. Exiting"); uninitialize(); return; } } catch (Exception ex) { LOG.warn("Error starting ReplicationEndpoint, exiting", ex); throw new RuntimeException(ex); } // get the WALEntryFilter from ReplicationEndpoint and add it to default filters ArrayList<WALEntryFilter> filters = Lists.newArrayList( (WALEntryFilter)new SystemTableWALEntryFilter()); WALEntryFilter filterFromEndpoint = this.replicationEndpoint.getWALEntryfilter(); if (filterFromEndpoint != null) { filters.add(filterFromEndpoint); } this.walEntryFilter = new ChainWALEntryFilter(filters); int sleepMultiplier = 1; // delay this until we are in an asynchronous thread while (this.isSourceActive() && this.peerClusterId == null) { this.peerClusterId = replicationEndpoint.getPeerUUID(); if (this.isSourceActive() && this.peerClusterId == null) { if (sleepForRetries("Cannot contact the peer's zk ensemble", sleepMultiplier)) { sleepMultiplier++; } } } // In rare case, zookeeper setting may be messed up. That leads to the incorrect // peerClusterId value, which is the same as the source clusterId if (clusterId.equals(peerClusterId) && !replicationEndpoint.canReplicateToSameCluster()) { this.terminate("ClusterId " + clusterId + " is replicating to itself: peerClusterId " + peerClusterId + " which is not allowed by ReplicationEndpoint:" + replicationEndpoint.getClass().getName(), null, false); this.manager.closeQueue(this); return; } LOG.info("Replicating " + clusterId + " -> " + peerClusterId); // start workers for (Map.Entry<String, PriorityBlockingQueue<Path>> entry : queues.entrySet()) { String walGroupId = entry.getKey(); PriorityBlockingQueue<Path> queue = entry.getValue(); final ReplicationSourceShipperThread worker = new ReplicationSourceShipperThread(walGroupId, queue, replicationQueueInfo, this); ReplicationSourceShipperThread extant = workerThreads.putIfAbsent(walGroupId, worker); if (extant != null) { LOG.debug("Someone has beat us to start a worker thread for wal group " + walGroupId); } else { LOG.debug("Starting up worker for wal group " + walGroupId); worker.startup(); } } } /** * Do the sleeping logic * @param msg Why we sleep * @param sleepMultiplier by how many times the default sleeping time is augmented * @return True if <code>sleepMultiplier</code> is < <code>maxRetriesMultiplier</code> */ protected boolean sleepForRetries(String msg, int sleepMultiplier) { try { if (LOG.isTraceEnabled()) { LOG.trace(msg + ", sleeping " + sleepForRetries + " times " + sleepMultiplier); } Thread.sleep(this.sleepForRetries * sleepMultiplier); } catch (InterruptedException e) { LOG.debug("Interrupted while sleeping between retries"); Thread.currentThread().interrupt(); } return sleepMultiplier < maxRetriesMultiplier; } /** * check whether the peer is enabled or not * * @return true if the peer is enabled, otherwise false */ protected boolean isPeerEnabled() { return this.replicationPeers.getStatusOfPeer(this.peerId); } @Override public void startup() { String n = Thread.currentThread().getName(); Thread.UncaughtExceptionHandler handler = new Thread.UncaughtExceptionHandler() { @Override public void uncaughtException(final Thread t, final Throwable e) { LOG.error("Unexpected exception in ReplicationSource", e); } }; Threads .setDaemonThreadRunning(this, n + ".replicationSource," + this.peerClusterZnode, handler); } @Override public void terminate(String reason) { terminate(reason, null); } @Override public void terminate(String reason, Exception cause) { terminate(reason, cause, true); } public void terminate(String reason, Exception cause, boolean join) { if (cause == null) { LOG.info("Closing source " + this.peerClusterZnode + " because: " + reason); } else { LOG.error("Closing source " + this.peerClusterZnode + " because an error occurred: " + reason, cause); } this.sourceRunning = false; Collection<ReplicationSourceShipperThread> workers = workerThreads.values(); for (ReplicationSourceShipperThread worker : workers) { worker.setWorkerRunning(false); worker.entryReader.interrupt(); worker.interrupt(); } ListenableFuture<Service.State> future = null; if (this.replicationEndpoint != null) { future = this.replicationEndpoint.stop(); } if (join) { for (ReplicationSourceShipperThread worker : workers) { Threads.shutdown(worker, this.sleepForRetries); LOG.info("ReplicationSourceWorker " + worker.getName() + " terminated"); } if (future != null) { try { future.get(sleepForRetries * maxRetriesMultiplier, TimeUnit.MILLISECONDS); } catch (Exception e) { LOG.warn("Got exception while waiting for endpoint to shutdown for replication source :" + this.peerClusterZnode, e); } } } } @Override public String getPeerClusterZnode() { return this.peerClusterZnode; } @Override public String getPeerClusterId() { return this.peerId; } @Override public Path getCurrentPath() { // only for testing for (ReplicationSourceShipperThread worker : workerThreads.values()) { if (worker.getCurrentPath() != null) return worker.getCurrentPath(); } return null; } private boolean isSourceActive() { return !this.stopper.isStopped() && this.sourceRunning; } /** * Comparator used to compare logs together based on their start time */ public static class LogsComparator implements Comparator<Path> { @Override public int compare(Path o1, Path o2) { return Long.valueOf(getTS(o1)).compareTo(getTS(o2)); } /** * Split a path to get the start time * For example: 10.20.20.171%3A60020.1277499063250 * @param p path to split * @return start time */ private static long getTS(Path p) { int tsIndex = p.getName().lastIndexOf('.') + 1; return Long.parseLong(p.getName().substring(tsIndex)); } } @Override public String getStats() { StringBuilder sb = new StringBuilder(); sb.append("Total replicated edits: ").append(totalReplicatedEdits) .append(", current progress: \n"); for (Map.Entry<String, ReplicationSourceShipperThread> entry : workerThreads.entrySet()) { String walGroupId = entry.getKey(); ReplicationSourceShipperThread worker = entry.getValue(); long position = worker.getCurrentPosition(); Path currentPath = worker.getCurrentPath(); sb.append("walGroup [").append(walGroupId).append("]: "); if (currentPath != null) { sb.append("currently replicating from: ").append(currentPath).append(" at position: ") .append(position).append("\n"); } else { sb.append("no replication ongoing, waiting for new log"); } } return sb.toString(); } /** * Get Replication Source Metrics * @return sourceMetrics */ public MetricsSource getSourceMetrics() { return this.metrics; } private long getCurrentBandwidth() { ReplicationPeer replicationPeer = this.replicationPeers.getConnectedPeer(peerId); long peerBandwidth = replicationPeer != null ? replicationPeer.getPeerBandwidth() : 0; // user can set peer bandwidth to 0 to use default bandwidth return peerBandwidth != 0 ? peerBandwidth : defaultBandwidth; } // This thread reads entries from a queue and ships them. // Entries are placed onto the queue by ReplicationSourceWALReaderThread public class ReplicationSourceShipperThread extends Thread { ReplicationSourceInterface source; String walGroupId; PriorityBlockingQueue<Path> queue; ReplicationQueueInfo replicationQueueInfo; // Last position in the log that we sent to ZooKeeper private long lastLoggedPosition = -1; // Path of the current log private volatile Path currentPath; // Indicates whether this particular worker is running private boolean workerRunning = true; ReplicationSourceWALReaderThread entryReader; // Use guava cache to set ttl for each key private LoadingCache<String, Boolean> canSkipWaitingSet = CacheBuilder.newBuilder() .expireAfterAccess(1, TimeUnit.DAYS).build( new CacheLoader<String, Boolean>() { @Override public Boolean load(String key) throws Exception { return false; } } ); public ReplicationSourceShipperThread(String walGroupId, PriorityBlockingQueue<Path> queue, ReplicationQueueInfo replicationQueueInfo, ReplicationSourceInterface source) { this.walGroupId = walGroupId; this.queue = queue; this.replicationQueueInfo = replicationQueueInfo; this.source = source; } @Override public void run() { // Loop until we close down while (isWorkerActive()) { int sleepMultiplier = 1; // Sleep until replication is enabled again if (!isPeerEnabled()) { if (sleepForRetries("Replication is disabled", sleepMultiplier)) { sleepMultiplier++; } continue; } while (entryReader == null) { if (sleepForRetries("Replication WAL entry reader thread not initialized", sleepMultiplier)) { sleepMultiplier++; } if (sleepMultiplier == maxRetriesMultiplier) { LOG.warn("Replication WAL entry reader thread not initialized"); } } try { WALEntryBatch entryBatch = entryReader.take(); for (Map.Entry<String, Long> entry : entryBatch.getLastSeqIds().entrySet()) { waitingUntilCanPush(entry); } shipEdits(entryBatch); releaseBufferQuota((int) entryBatch.getHeapSize()); if (replicationQueueInfo.isQueueRecovered() && entryBatch.getWalEntries().isEmpty() && entryBatch.getLastSeqIds().isEmpty()) { LOG.debug("Finished recovering queue for group " + walGroupId + " of peer " + peerClusterZnode); metrics.incrCompletedRecoveryQueue(); setWorkerRunning(false); continue; } } catch (InterruptedException e) { LOG.trace("Interrupted while waiting for next replication entry batch", e); Thread.currentThread().interrupt(); } } if (replicationQueueInfo.isQueueRecovered()) { // use synchronize to make sure one last thread will clean the queue synchronized (workerThreads) { Threads.sleep(100);// wait a short while for other worker thread to fully exit boolean allOtherTaskDone = true; for (ReplicationSourceShipperThread worker : workerThreads.values()) { if (!worker.equals(this) && worker.isAlive()) { allOtherTaskDone = false; break; } } if (allOtherTaskDone) { manager.closeRecoveredQueue(this.source); LOG.info("Finished recovering queue " + peerClusterZnode + " with the following stats: " + getStats()); } } } } private void waitingUntilCanPush(Map.Entry<String, Long> entry) { String key = entry.getKey(); long seq = entry.getValue(); boolean deleteKey = false; if (seq <= 0) { // There is a REGION_CLOSE marker, we can not continue skipping after this entry. deleteKey = true; seq = -seq; } if (!canSkipWaitingSet.getUnchecked(key)) { try { manager.waitUntilCanBePushed(Bytes.toBytes(key), seq, actualPeerId); } catch (IOException e) { LOG.error("waitUntilCanBePushed fail", e); stopper.stop("waitUntilCanBePushed fail"); } catch (InterruptedException e) { LOG.warn("waitUntilCanBePushed interrupted", e); Thread.currentThread().interrupt(); } canSkipWaitingSet.put(key, true); } if (deleteKey) { canSkipWaitingSet.invalidate(key); } } private void cleanUpHFileRefs(WALEdit edit) throws IOException { String peerId = peerClusterZnode; if (peerId.contains("-")) { // peerClusterZnode will be in the form peerId + "-" + rsZNode. // A peerId will not have "-" in its name, see HBASE-11394 peerId = peerClusterZnode.split("-")[0]; } List<Cell> cells = edit.getCells(); int totalCells = cells.size(); for (int i = 0; i < totalCells; i++) { Cell cell = cells.get(i); if (CellUtil.matchingQualifier(cell, WALEdit.BULK_LOAD)) { BulkLoadDescriptor bld = WALEdit.getBulkLoadDescriptor(cell); List<StoreDescriptor> stores = bld.getStoresList(); int totalStores = stores.size(); for (int j = 0; j < totalStores; j++) { List<String> storeFileList = stores.get(j).getStoreFileList(); manager.cleanUpHFileRefs(peerId, storeFileList); metrics.decrSizeOfHFileRefsQueue(storeFileList.size()); } } } } private void checkBandwidthChangeAndResetThrottler() { long peerBandwidth = getCurrentBandwidth(); if (peerBandwidth != currentBandwidth) { currentBandwidth = peerBandwidth; throttler.setBandwidth((double) currentBandwidth / 10.0); LOG.info("ReplicationSource : " + peerId + " bandwidth throttling changed, currentBandWidth=" + currentBandwidth); } } /** * Do the shipping logic */ protected void shipEdits(WALEntryBatch entryBatch) { List<Entry> entries = entryBatch.getWalEntries(); long lastReadPosition = entryBatch.getLastWalPosition(); currentPath = entryBatch.getLastWalPath(); int sleepMultiplier = 0; if (entries.isEmpty()) { if (lastLoggedPosition != lastReadPosition) { // Save positions to meta table before zk. updateSerialRepPositions(entryBatch.getLastSeqIds()); updateLogPosition(lastReadPosition); // if there was nothing to ship and it's not an error // set "ageOfLastShippedOp" to <now> to indicate that we're current metrics.setAgeOfLastShippedOp(EnvironmentEdgeManager.currentTime(), walGroupId); } return; } int currentSize = (int) entryBatch.getHeapSize(); while (isWorkerActive()) { try { checkBandwidthChangeAndResetThrottler(); if (throttler.isEnabled()) { long sleepTicks = throttler.getNextSleepInterval(currentSize); if (sleepTicks > 0) { try { if (LOG.isTraceEnabled()) { LOG.trace("To sleep " + sleepTicks + "ms for throttling control"); } Thread.sleep(sleepTicks); } catch (InterruptedException e) { LOG.debug("Interrupted while sleeping for throttling control"); Thread.currentThread().interrupt(); // current thread might be interrupted to terminate // directly go back to while() for confirm this continue; } // reset throttler's cycle start tick when sleep for throttling occurs throttler.resetStartTick(); } } // create replicateContext here, so the entries can be GC'd upon return from this call // stack ReplicationEndpoint.ReplicateContext replicateContext = new ReplicationEndpoint.ReplicateContext(); replicateContext.setEntries(entries).setSize(currentSize); replicateContext.setWalGroupId(walGroupId); long startTimeNs = System.nanoTime(); // send the edits to the endpoint. Will block until the edits are shipped and acknowledged boolean replicated = replicationEndpoint.replicate(replicateContext); long endTimeNs = System.nanoTime(); if (!replicated) { continue; } else { sleepMultiplier = Math.max(sleepMultiplier - 1, 0); } if (this.lastLoggedPosition != lastReadPosition) { //Clean up hfile references int size = entries.size(); for (int i = 0; i < size; i++) { cleanUpHFileRefs(entries.get(i).getEdit()); } // Save positions to meta table before zk. updateSerialRepPositions(entryBatch.getLastSeqIds()); //Log and clean up WAL logs updateLogPosition(lastReadPosition); } if (throttler.isEnabled()) { throttler.addPushSize(currentSize); } totalReplicatedEdits.addAndGet(entries.size()); totalReplicatedOperations.addAndGet(entryBatch.getNbOperations()); // FIXME check relationship between wal group and overall metrics.shipBatch(entryBatch.getNbOperations(), currentSize, entryBatch.getNbHFiles()); metrics.setAgeOfLastShippedOp(entries.get(entries.size() - 1).getKey().getWriteTime(), walGroupId); if (LOG.isTraceEnabled()) { LOG.trace("Replicated " + totalReplicatedEdits + " entries in total, or " + totalReplicatedOperations + " operations in " + ((endTimeNs - startTimeNs) / 1000000) + " ms"); } break; } catch (Exception ex) { LOG.warn(replicationEndpoint.getClass().getName() + " threw unknown exception:" + org.apache.hadoop.util.StringUtils.stringifyException(ex)); if (sleepForRetries("ReplicationEndpoint threw exception", sleepMultiplier)) { sleepMultiplier++; } } } } private void updateSerialRepPositions(Map<String, Long> lastPositionsForSerialScope) { try { MetaTableAccessor.updateReplicationPositions(manager.getConnection(), actualPeerId, lastPositionsForSerialScope); } catch (IOException e) { LOG.error("updateReplicationPositions fail", e); stopper.stop("updateReplicationPositions fail"); } } private void updateLogPosition(long lastReadPosition) { manager.logPositionAndCleanOldLogs(currentPath, peerClusterZnode, lastReadPosition, this.replicationQueueInfo.isQueueRecovered(), false); lastLoggedPosition = lastReadPosition; } public void startup() { String n = Thread.currentThread().getName(); Thread.UncaughtExceptionHandler handler = new Thread.UncaughtExceptionHandler() { @Override public void uncaughtException(final Thread t, final Throwable e) { RSRpcServices.exitIfOOME(e); LOG.error("Unexpected exception in ReplicationSourceWorkerThread," + " currentPath=" + getCurrentPath(), e); stopper.stop("Unexpected exception in ReplicationSourceWorkerThread"); } }; Threads.setDaemonThreadRunning(this, n + ".replicationSource." + walGroupId + "," + peerClusterZnode, handler); workerThreads.put(walGroupId, this); long startPosition = 0; if (this.replicationQueueInfo.isQueueRecovered()) { startPosition = getRecoveredQueueStartPos(startPosition); int numRetries = 0; while (numRetries <= maxRetriesMultiplier) { try { locateRecoveredPaths(); break; } catch (IOException e) { LOG.error("Error while locating recovered queue paths, attempt #" + numRetries); numRetries++; } } } startWALReaderThread(n, handler, startPosition); } // If this is a recovered queue, the queue is already full and the first log // normally has a position (unless the RS failed between 2 logs) private long getRecoveredQueueStartPos(long startPosition) { try { startPosition = (replicationQueues.getLogPosition(peerClusterZnode, this.queue.peek().getName())); if (LOG.isTraceEnabled()) { LOG.trace("Recovered queue started with log " + this.queue.peek() + " at position " + startPosition); } } catch (ReplicationException e) { terminate("Couldn't get the position of this recovered queue " + peerClusterZnode, e); } return startPosition; } // start a background thread to read and batch entries private void startWALReaderThread(String threadName, Thread.UncaughtExceptionHandler handler, long startPosition) { ArrayList<WALEntryFilter> filters = Lists.newArrayList(walEntryFilter, new ClusterMarkingEntryFilter(clusterId, peerClusterId, replicationEndpoint)); ChainWALEntryFilter readerFilter = new ChainWALEntryFilter(filters); entryReader = new ReplicationSourceWALReaderThread(manager, replicationQueueInfo, queue, startPosition, fs, conf, readerFilter, metrics); Threads.setDaemonThreadRunning(entryReader, threadName + ".replicationSource.replicationWALReaderThread." + walGroupId + "," + peerClusterZnode, handler); } // Loops through the recovered queue and tries to find the location of each log // this is necessary because the logs may have moved before recovery was initiated private void locateRecoveredPaths() throws IOException { boolean hasPathChanged = false; PriorityBlockingQueue<Path> newPaths = new PriorityBlockingQueue<Path>(queueSizePerGroup, new LogsComparator()); pathsLoop: for (Path path : queue) { if (fs.exists(path)) { // still in same location, don't need to do anything newPaths.add(path); continue; } // Path changed - try to find the right path. hasPathChanged = true; if (stopper instanceof ReplicationSyncUp.DummyServer) { // In the case of disaster/recovery, HMaster may be shutdown/crashed before flush data // from .logs to .oldlogs. Loop into .logs folders and check whether a match exists Path newPath = getReplSyncUpPath(path); newPaths.add(newPath); continue; } else { // See if Path exists in the dead RS folder (there could be a chain of failures // to look at) List<String> deadRegionServers = this.replicationQueueInfo.getDeadRegionServers(); LOG.info("NB dead servers : " + deadRegionServers.size()); final Path walDir = FSUtils.getWALRootDir(conf); for (String curDeadServerName : deadRegionServers) { final Path deadRsDirectory = new Path(walDir, AbstractFSWALProvider.getWALDirectoryName(curDeadServerName)); Path[] locs = new Path[] { new Path(deadRsDirectory, path.getName()), new Path( deadRsDirectory.suffix(AbstractFSWALProvider.SPLITTING_EXT), path.getName()) }; for (Path possibleLogLocation : locs) { LOG.info("Possible location " + possibleLogLocation.toUri().toString()); if (manager.getFs().exists(possibleLogLocation)) { // We found the right new location LOG.info("Log " + path + " still exists at " + possibleLogLocation); newPaths.add(possibleLogLocation); continue pathsLoop; } } } // didn't find a new location LOG.error( String.format("WAL Path %s doesn't exist and couldn't find its new location", path)); newPaths.add(path); } } if (hasPathChanged) { if (newPaths.size() != queue.size()) { // this shouldn't happen LOG.error("Recovery queue size is incorrect"); throw new IOException("Recovery queue size error"); } // put the correct locations in the queue // since this is a recovered queue with no new incoming logs, // there shouldn't be any concurrency issues queue.clear(); for (Path path : newPaths) { queue.add(path); } } } // N.B. the ReplicationSyncUp tool sets the manager.getWALDir to the root of the wal // area rather than to the wal area for a particular region server. private Path getReplSyncUpPath(Path path) throws IOException { FileStatus[] rss = fs.listStatus(manager.getLogDir()); for (FileStatus rs : rss) { Path p = rs.getPath(); FileStatus[] logs = fs.listStatus(p); for (FileStatus log : logs) { p = new Path(p, log.getPath().getName()); if (p.getName().equals(path.getName())) { LOG.info("Log " + p.getName() + " found at " + p); return p; } } } LOG.error("Didn't find path for: " + path.getName()); return path; } public Path getCurrentPath() { return this.currentPath; } public long getCurrentPosition() { return this.lastLoggedPosition; } private boolean isWorkerActive() { return !stopper.isStopped() && workerRunning && !isInterrupted(); } private void terminate(String reason, Exception cause) { if (cause == null) { LOG.info("Closing worker for wal group " + this.walGroupId + " because: " + reason); } else { LOG.error("Closing worker for wal group " + this.walGroupId + " because an error occurred: " + reason, cause); } entryReader.interrupt(); Threads.shutdown(entryReader, sleepForRetries); this.interrupt(); Threads.shutdown(this, sleepForRetries); LOG.info("ReplicationSourceWorker " + this.getName() + " terminated"); } public void setWorkerRunning(boolean workerRunning) { entryReader.setReaderRunning(workerRunning); this.workerRunning = workerRunning; } private void releaseBufferQuota(int size) { totalBufferUsed.addAndGet(-size); } } }