ReplicationSourceWALReaderThread.java example

Explorer
hbase-master
/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.replication.regionserver;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.PriorityBlockingQueue;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.classification.InterfaceStability;
import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos;
import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.BulkLoadDescriptor;
import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.StoreDescriptor;
import org.apache.hadoop.hbase.regionserver.wal.WALEdit;
import org.apache.hadoop.hbase.replication.ReplicationQueueInfo;
import org.apache.hadoop.hbase.replication.WALEntryFilter;
import org.apache.hadoop.hbase.replication.regionserver.WALEntryStream.WALEntryStreamRuntimeException;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.hbase.wal.WAL.Entry;

/**
 * Reads and filters WAL entries, groups the filtered entries into batches, and puts the batches onto a queue
 *
 */
@InterfaceAudience.Private
@InterfaceStability.Evolving
public class ReplicationSourceWALReaderThread extends Thread {
  private static final Log LOG = LogFactory.getLog(ReplicationSourceWALReaderThread.class);

  private PriorityBlockingQueue<Path> logQueue;
  private FileSystem fs;
  private Configuration conf;
  private BlockingQueue<WALEntryBatch> entryBatchQueue;
  // max (heap) size of each batch - multiply by number of batches in queue to get total
  private long replicationBatchSizeCapacity;
  // max count of each batch - multiply by number of batches in queue to get total
  private int replicationBatchCountCapacity;
  // position in the WAL to start reading at
  private long currentPosition;
  private WALEntryFilter filter;
  private long sleepForRetries;
  //Indicates whether this particular worker is running
  private boolean isReaderRunning = true;
  private ReplicationQueueInfo replicationQueueInfo;
  private int maxRetriesMultiplier;
  private MetricsSource metrics;

  private AtomicLong totalBufferUsed;
  private long totalBufferQuota;

  /**
   * Creates a reader worker for a given WAL queue. Reads WAL entries off a given queue, batches the
   * entries, and puts them on a batch queue.
   * @param manager replication manager
   * @param replicationQueueInfo
   * @param logQueue The WAL queue to read off of
   * @param startPosition position in the first WAL to start reading from
   * @param fs the files system to use
   * @param conf configuration to use
   * @param filter The filter to use while reading
   * @param metrics replication metrics
   */
  public ReplicationSourceWALReaderThread(ReplicationSourceManager manager,
      ReplicationQueueInfo replicationQueueInfo, PriorityBlockingQueue<Path> logQueue,
      long startPosition,
      FileSystem fs, Configuration conf, WALEntryFilter filter, MetricsSource metrics) {
    this.replicationQueueInfo = replicationQueueInfo;
    this.logQueue = logQueue;
    this.currentPosition = startPosition;
    this.fs = fs;
    this.conf = conf;
    this.filter = filter;
    this.replicationBatchSizeCapacity =
        this.conf.getLong("replication.source.size.capacity", 1024 * 1024 * 64);
    this.replicationBatchCountCapacity = this.conf.getInt("replication.source.nb.capacity", 25000);
    // memory used will be batchSizeCapacity * (nb.batches + 1)
    // the +1 is for the current thread reading before placing onto the queue
    int batchCount = conf.getInt("replication.source.nb.batches", 1);
    this.totalBufferUsed = manager.getTotalBufferUsed();
    this.totalBufferQuota = conf.getLong(HConstants.REPLICATION_SOURCE_TOTAL_BUFFER_KEY,
      HConstants.REPLICATION_SOURCE_TOTAL_BUFFER_DFAULT);
    this.sleepForRetries =
        this.conf.getLong("replication.source.sleepforretries", 1000);    // 1 second
    this.maxRetriesMultiplier =
        this.conf.getInt("replication.source.maxretriesmultiplier", 300); // 5 minutes @ 1 sec per
    this.metrics = metrics;
    this.entryBatchQueue = new LinkedBlockingQueue<>(batchCount);
    LOG.info("peerClusterZnode=" + replicationQueueInfo.getPeerClusterZnode()
        + ", ReplicationSourceWALReaderThread : " + replicationQueueInfo.getPeerId()
        + " inited, replicationBatchSizeCapacity=" + replicationBatchSizeCapacity
        + ", replicationBatchCountCapacity=" + replicationBatchCountCapacity
        + ", replicationBatchQueueCapacity=" + batchCount);
  }

  @Override
  public void run() {
    int sleepMultiplier = 1;
    while (isReaderRunning()) { // we only loop back here if something fatal happened to our stream
      try (WALEntryStream entryStream =
          new WALEntryStream(logQueue, fs, conf, currentPosition, metrics)) {
        while (isReaderRunning()) { // loop here to keep reusing stream while we can
          if (!checkQuota()) {
            continue;
          }
          WALEntryBatch batch = null;
          while (entryStream.hasNext()) {
            if (batch == null) {
              batch = new WALEntryBatch(replicationBatchCountCapacity, entryStream.getCurrentPath());
            }
            Entry entry = entryStream.next();
            if (updateSerialReplPos(batch, entry)) {
              batch.lastWalPosition = entryStream.getPosition();
              break;
            }
            entry = filterEntry(entry);
            if (entry != null) {
              WALEdit edit = entry.getEdit();
              if (edit != null && !edit.isEmpty()) {
                long entrySize = getEntrySize(entry);
                batch.addEntry(entry);
                updateBatchStats(batch, entry, entryStream.getPosition(), entrySize);
                boolean totalBufferTooLarge = acquireBufferQuota(entrySize);
                // Stop if too many entries or too big
                if (totalBufferTooLarge || batch.getHeapSize() >= replicationBatchSizeCapacity
                    || batch.getNbEntries() >= replicationBatchCountCapacity) {
                  break;
                }
              }
            }
          }
          if (batch != null && (!batch.getLastSeqIds().isEmpty() || batch.getNbEntries() > 0)) {
            if (LOG.isTraceEnabled()) {
              LOG.trace(String.format("Read %s WAL entries eligible for replication",
                batch.getNbEntries()));
            }
            entryBatchQueue.put(batch);
            sleepMultiplier = 1;
          } else { // got no entries and didn't advance position in WAL
            LOG.trace("Didn't read any new entries from WAL");
            if (replicationQueueInfo.isQueueRecovered()) {
              // we're done with queue recovery, shut ourself down
              setReaderRunning(false);
              // shuts down shipper thread immediately
              entryBatchQueue.put(batch != null ? batch
                  : new WALEntryBatch(replicationBatchCountCapacity, entryStream.getCurrentPath()));
            } else {
              Thread.sleep(sleepForRetries);
            }
          }
          currentPosition = entryStream.getPosition();
          entryStream.reset(); // reuse stream
        }
      } catch (IOException | WALEntryStreamRuntimeException e) { // stream related
        if (sleepMultiplier < maxRetriesMultiplier) {
          LOG.debug("Failed to read stream of replication entries: " + e);
          sleepMultiplier++;
        } else {
          LOG.error("Failed to read stream of replication entries", e);
        }
        Threads.sleep(sleepForRetries * sleepMultiplier);
      } catch (InterruptedException e) {
        LOG.trace("Interrupted while sleeping between WAL reads");
        Thread.currentThread().interrupt();
      }
    }
  }

  //returns false if we've already exceeded the global quota
  private boolean checkQuota() {
    // try not to go over total quota
    if (totalBufferUsed.get() > totalBufferQuota) {
      Threads.sleep(sleepForRetries);
      return false;
    }
    return true;
  }

  private Entry filterEntry(Entry entry) {
    Entry filtered = filter.filter(entry);
    if (entry != null && filtered == null) {
      metrics.incrLogEditsFiltered();
    }
    return filtered;
  }

  /**
   * @return true if we should stop reading because we're at REGION_CLOSE
   */
  private boolean updateSerialReplPos(WALEntryBatch batch, Entry entry) throws IOException {
    if (entry.hasSerialReplicationScope()) {
      String key = Bytes.toString(entry.getKey().getEncodedRegionName());
      batch.setLastPosition(key, entry.getKey().getSequenceId());
      if (!entry.getEdit().getCells().isEmpty()) {
        WALProtos.RegionEventDescriptor maybeEvent =
            WALEdit.getRegionEventDescriptor(entry.getEdit().getCells().get(0));
        if (maybeEvent != null && maybeEvent
            .getEventType() == WALProtos.RegionEventDescriptor.EventType.REGION_CLOSE) {
          // In serially replication, if we move a region to another RS and move it back, we may
          // read logs crossing two sections. We should break at REGION_CLOSE and push the first
          // section first in case of missing the middle section belonging to the other RS.
          // In a worker thread, if we can push the first log of a region, we can push all logs
          // in the same region without waiting until we read a close marker because next time
          // we read logs in this region, it must be a new section and not adjacent with this
          // region. Mark it negative.
          batch.setLastPosition(key, -entry.getKey().getSequenceId());
          return true;
        }
      }
    }
    return false;
  }

  /**
   * Retrieves the next batch of WAL entries from the queue, waiting up to the specified time for a
   * batch to become available
   * @return A batch of entries, along with the position in the log after reading the batch
   * @throws InterruptedException if interrupted while waiting
   */
  public WALEntryBatch take() throws InterruptedException {
    return entryBatchQueue.take();
  }

  private long getEntrySize(Entry entry) {
    WALEdit edit = entry.getEdit();
    return edit.heapSize() + calculateTotalSizeOfStoreFiles(edit);
  }

  private void updateBatchStats(WALEntryBatch batch, Entry entry, long entryPosition, long entrySize) {
    WALEdit edit = entry.getEdit();
    if (edit != null && !edit.isEmpty()) {
      batch.incrementHeapSize(entrySize);
      Pair<Integer, Integer> nbRowsAndHFiles = countDistinctRowKeysAndHFiles(edit);
      batch.incrementNbRowKeys(nbRowsAndHFiles.getFirst());
      batch.incrementNbHFiles(nbRowsAndHFiles.getSecond());
    }
    batch.lastWalPosition = entryPosition;
  }

  /**
   * Count the number of different row keys in the given edit because of mini-batching. We assume
   * that there's at least one Cell in the WALEdit.
   * @param edit edit to count row keys from
   * @return number of different row keys and HFiles
   */
  private Pair<Integer, Integer> countDistinctRowKeysAndHFiles(WALEdit edit) {
    List<Cell> cells = edit.getCells();
    int distinctRowKeys = 1;
    int totalHFileEntries = 0;
    Cell lastCell = cells.get(0);

    int totalCells = edit.size();
    for (int i = 0; i < totalCells; i++) {
      // Count HFiles to be replicated
      if (CellUtil.matchingQualifier(cells.get(i), WALEdit.BULK_LOAD)) {
        try {
          BulkLoadDescriptor bld = WALEdit.getBulkLoadDescriptor(cells.get(i));
          List<StoreDescriptor> stores = bld.getStoresList();
          int totalStores = stores.size();
          for (int j = 0; j < totalStores; j++) {
            totalHFileEntries += stores.get(j).getStoreFileList().size();
          }
        } catch (IOException e) {
          LOG.error("Failed to deserialize bulk load entry from wal edit. "
              + "Then its hfiles count will not be added into metric.");
        }
      }

      if (!CellUtil.matchingRows(cells.get(i), lastCell)) {
        distinctRowKeys++;
      }
      lastCell = cells.get(i);
    }

    Pair<Integer, Integer> result = new Pair<>(distinctRowKeys, totalHFileEntries);
    return result;
  }

  /**
   * Calculate the total size of all the store files
   * @param edit edit to count row keys from
   * @return the total size of the store files
   */
  private int calculateTotalSizeOfStoreFiles(WALEdit edit) {
    List<Cell> cells = edit.getCells();
    int totalStoreFilesSize = 0;

    int totalCells = edit.size();
    for (int i = 0; i < totalCells; i++) {
      if (CellUtil.matchingQualifier(cells.get(i), WALEdit.BULK_LOAD)) {
        try {
          BulkLoadDescriptor bld = WALEdit.getBulkLoadDescriptor(cells.get(i));
          List<StoreDescriptor> stores = bld.getStoresList();
          int totalStores = stores.size();
          for (int j = 0; j < totalStores; j++) {
            totalStoreFilesSize += stores.get(j).getStoreFileSizeBytes();
          }
        } catch (IOException e) {
          LOG.error("Failed to deserialize bulk load entry from wal edit. "
              + "Size of HFiles part of cell will not be considered in replication "
              + "request size calculation.",
            e);
        }
      }
    }
    return totalStoreFilesSize;
  }

  /**
   * @param size delta size for grown buffer
   * @return true if we should clear buffer and push all
   */
  private boolean acquireBufferQuota(long size) {
    return totalBufferUsed.addAndGet(size) >= totalBufferQuota;
  }

  /**
   * @return whether the reader thread is running
   */
  public boolean isReaderRunning() {
    return isReaderRunning && !isInterrupted();
  }

  /**
   * @param readerRunning the readerRunning to set
   */
  public void setReaderRunning(boolean readerRunning) {
    this.isReaderRunning = readerRunning;
  }

  /**
   * Holds a batch of WAL entries to replicate, along with some statistics
   *
   */
  static class WALEntryBatch {
    private List<Entry> walEntries;
    // last WAL that was read
    private Path lastWalPath;
    // position in WAL of last entry in this batch
    private long lastWalPosition = 0;
    // number of distinct row keys in this batch
    private int nbRowKeys = 0;
    // number of HFiles
    private int nbHFiles = 0;
    // heap size of data we need to replicate
    private long heapSize = 0;
    // save the last sequenceid for each region if the table has serial-replication scope
    private Map<String, Long> lastSeqIds = new HashMap<>();

    /**
     * @param walEntries
     * @param lastWalPath Path of the WAL the last entry in this batch was read from
     * @param lastWalPosition Position in the WAL the last entry in this batch was read from
     */
    private WALEntryBatch(int maxNbEntries, Path lastWalPath) {
      this.walEntries = new ArrayList<>(maxNbEntries);
      this.lastWalPath = lastWalPath;
    }

    public void addEntry(Entry entry) {
      walEntries.add(entry);
    }

    /**
     * @return the WAL Entries.
     */
    public List<Entry> getWalEntries() {
      return walEntries;
    }

    /**
     * @return the path of the last WAL that was read.
     */
    public Path getLastWalPath() {
      return lastWalPath;
    }

    /**
     * @return the position in the last WAL that was read.
     */
    public long getLastWalPosition() {
      return lastWalPosition;
    }

    public int getNbEntries() {
      return walEntries.size();
    }

    /**
     * @return the number of distinct row keys in this batch
     */
    public int getNbRowKeys() {
      return nbRowKeys;
    }

    /**
     * @return the number of HFiles in this batch
     */
    public int getNbHFiles() {
      return nbHFiles;
    }

    /**
     * @return total number of operations in this batch
     */
    public int getNbOperations() {
      return getNbRowKeys() + getNbHFiles();
    }

    /**
     * @return the heap size of this batch
     */
    public long getHeapSize() {
      return heapSize;
    }

    /**
     * @return the last sequenceid for each region if the table has serial-replication scope
     */
    public Map<String, Long> getLastSeqIds() {
      return lastSeqIds;
    }

    private void incrementNbRowKeys(int increment) {
      nbRowKeys += increment;
    }

    private void incrementNbHFiles(int increment) {
      nbHFiles += increment;
    }

    private void incrementHeapSize(long increment) {
      heapSize += increment;
    }

    private void setLastPosition(String region, Long sequenceId) {
      getLastSeqIds().put(region, sequenceId);
    }
  }
}