/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.client; import static org.apache.hadoop.hbase.HConstants.CATALOG_FAMILY; import static org.apache.hadoop.hbase.HConstants.NINES; import static org.apache.hadoop.hbase.HConstants.ZEROES; import static org.apache.hadoop.hbase.HRegionInfo.createRegionName; import static org.apache.hadoop.hbase.TableName.META_TABLE_NAME; import static org.apache.hadoop.hbase.client.ConnectionUtils.createClosestRowAfter; import static org.apache.hadoop.hbase.client.ConnectionUtils.isEmptyStopRow; import static org.apache.hadoop.hbase.util.Bytes.BYTES_COMPARATOR; import static org.apache.hadoop.hbase.util.CollectionUtils.computeIfAbsent; import java.io.IOException; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.CompletableFuture; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.ConcurrentNavigableMap; import java.util.concurrent.ConcurrentSkipListMap; import java.util.concurrent.ThreadLocalRandom; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hbase.DoNotRetryIOException; import org.apache.hadoop.hbase.HRegionInfo; import org.apache.hadoop.hbase.HRegionLocation; import org.apache.hadoop.hbase.MetaTableAccessor; import org.apache.hadoop.hbase.RegionLocations; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.TableNotFoundException; import org.apache.hadoop.hbase.classification.InterfaceAudience; import org.apache.hadoop.hbase.util.Bytes; /** * The asynchronous locator for regions other than meta. */ @InterfaceAudience.Private class AsyncNonMetaRegionLocator { private static final Log LOG = LogFactory.getLog(AsyncNonMetaRegionLocator.class); static final String MAX_CONCURRENT_LOCATE_REQUEST_PER_TABLE = "hbase.client.meta.max.concurrent.locate.per.table"; private static final int DEFAULT_MAX_CONCURRENT_LOCATE_REQUEST_PER_TABLE = 8; private final AsyncConnectionImpl conn; private final int maxConcurrentLocateRequestPerTable; private final ConcurrentMap<TableName, TableCache> cache = new ConcurrentHashMap<>(); private static final class LocateRequest { public final byte[] row; public final RegionLocateType locateType; public LocateRequest(byte[] row, RegionLocateType locateType) { this.row = row; this.locateType = locateType; } @Override public int hashCode() { return Bytes.hashCode(row) ^ locateType.hashCode(); } @Override public boolean equals(Object obj) { if (obj == null || obj.getClass() != LocateRequest.class) { return false; } LocateRequest that = (LocateRequest) obj; return locateType.equals(that.locateType) && Bytes.equals(row, that.row); } } private static final class TableCache { public final ConcurrentNavigableMap<byte[], HRegionLocation> cache = new ConcurrentSkipListMap<>(BYTES_COMPARATOR); public final Set<LocateRequest> pendingRequests = new HashSet<>(); public final Map<LocateRequest, CompletableFuture<HRegionLocation>> allRequests = new HashMap<>(); public boolean hasQuota(int max) { return pendingRequests.size() < max; } public boolean isPending(LocateRequest req) { return pendingRequests.contains(req); } public void send(LocateRequest req) { pendingRequests.add(req); } } AsyncNonMetaRegionLocator(AsyncConnectionImpl conn) { this.conn = conn; this.maxConcurrentLocateRequestPerTable = conn.getConfiguration().getInt( MAX_CONCURRENT_LOCATE_REQUEST_PER_TABLE, DEFAULT_MAX_CONCURRENT_LOCATE_REQUEST_PER_TABLE); } private TableCache getTableCache(TableName tableName) { return computeIfAbsent(cache, tableName, TableCache::new); } private void removeFromCache(HRegionLocation loc) { TableCache tableCache = cache.get(loc.getRegionInfo().getTable()); if (tableCache == null) { return; } tableCache.cache.computeIfPresent(loc.getRegionInfo().getStartKey(), (k, oldLoc) -> { if (oldLoc.getSeqNum() > loc.getSeqNum() || !oldLoc.getServerName().equals(loc.getServerName())) { return oldLoc; } return null; }); } // return whether we add this loc to cache private boolean addToCache(TableCache tableCache, HRegionLocation loc) { if (LOG.isTraceEnabled()) { LOG.trace("Try adding " + loc + " to cache"); } byte[] startKey = loc.getRegionInfo().getStartKey(); HRegionLocation oldLoc = tableCache.cache.putIfAbsent(startKey, loc); if (oldLoc == null) { return true; } if (oldLoc.getSeqNum() > loc.getSeqNum() || oldLoc.getServerName().equals(loc.getServerName())) { if (LOG.isTraceEnabled()) { LOG.trace("Will not add " + loc + " to cache because the old value " + oldLoc + " is newer than us or has the same server name"); } return false; } return loc == tableCache.cache.compute(startKey, (k, oldValue) -> { if (oldValue == null || oldValue.getSeqNum() <= loc.getSeqNum()) { return loc; } if (LOG.isTraceEnabled()) { LOG.trace("Will not add " + loc + " to cache because the old value " + oldValue + " is newer than us or has the same server name." + " Maybe it is updated before we replace it"); } return oldValue; }); } @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "UPM_UNCALLED_PRIVATE_METHOD", justification = "Called by lambda expression") private void addToCache(HRegionLocation loc) { addToCache(getTableCache(loc.getRegionInfo().getTable()), loc); if (LOG.isTraceEnabled()) { LOG.trace("Try adding " + loc + " to cache"); } } private boolean tryComplete(LocateRequest req, CompletableFuture<HRegionLocation> future, HRegionLocation loc) { if (future.isDone()) { return true; } boolean completed; if (req.locateType.equals(RegionLocateType.BEFORE)) { // for locating the row before current row, the common case is to find the previous region in // reverse scan, so we check the endKey first. In general, the condition should be startKey < // req.row and endKey >= req.row. Here we split it to endKey == req.row || (endKey > req.row // && startKey < req.row). The two conditions are equal since startKey < endKey. int c = Bytes.compareTo(loc.getRegionInfo().getEndKey(), req.row); completed = c == 0 || (c > 0 && Bytes.compareTo(loc.getRegionInfo().getStartKey(), req.row) < 0); } else { completed = loc.getRegionInfo().containsRow(req.row); } if (completed) { future.complete(loc); return true; } else { return false; } } private void complete(TableName tableName, LocateRequest req, HRegionLocation loc, Throwable error) { if (error != null) { if (LOG.isDebugEnabled()) { LOG.debug("Failed to locate region in '" + tableName + "', row='" + Bytes.toStringBinary(req.row) + "', locateType=" + req.locateType, error); } } LocateRequest toSend = null; TableCache tableCache = getTableCache(tableName); if (loc != null) { if (!addToCache(tableCache, loc)) { // someone is ahead of us. synchronized (tableCache) { tableCache.pendingRequests.remove(req); } return; } } synchronized (tableCache) { tableCache.pendingRequests.remove(req); if (error instanceof DoNotRetryIOException) { CompletableFuture<?> future = tableCache.allRequests.remove(req); if (future != null) { future.completeExceptionally(error); } } if (loc != null) { for (Iterator<Map.Entry<LocateRequest, CompletableFuture<HRegionLocation>>> iter = tableCache.allRequests.entrySet().iterator(); iter.hasNext();) { Map.Entry<LocateRequest, CompletableFuture<HRegionLocation>> entry = iter.next(); if (tryComplete(entry.getKey(), entry.getValue(), loc)) { iter.remove(); } } } if (!tableCache.allRequests.isEmpty() && tableCache.hasQuota(maxConcurrentLocateRequestPerTable)) { LocateRequest[] candidates = tableCache.allRequests.keySet().stream() .filter(r -> !tableCache.isPending(r)).toArray(LocateRequest[]::new); if (candidates.length > 0) { // TODO: use a better algorithm to send a request which is more likely to fetch a new // location. toSend = candidates[ThreadLocalRandom.current().nextInt(candidates.length)]; tableCache.send(toSend); } } } if (toSend != null) { locateInMeta(tableName, toSend); } } private void onScanComplete(TableName tableName, LocateRequest req, List<Result> results, Throwable error) { if (error != null) { complete(tableName, req, null, error); return; } if (results.isEmpty()) { complete(tableName, req, null, new TableNotFoundException(tableName)); return; } RegionLocations locs = MetaTableAccessor.getRegionLocations(results.get(0)); if (LOG.isDebugEnabled()) { LOG.debug("The fetched location of '" + tableName + "', row='" + Bytes.toStringBinary(req.row) + "', locateType=" + req.locateType + " is " + locs); } if (locs == null || locs.getDefaultRegionLocation() == null) { complete(tableName, req, null, new IOException(String.format("No location found for '%s', row='%s', locateType=%s", tableName, Bytes.toStringBinary(req.row), req.locateType))); return; } HRegionLocation loc = locs.getDefaultRegionLocation(); HRegionInfo info = loc.getRegionInfo(); if (info == null) { complete(tableName, req, null, new IOException(String.format("HRegionInfo is null for '%s', row='%s', locateType=%s", tableName, Bytes.toStringBinary(req.row), req.locateType))); return; } if (!info.getTable().equals(tableName)) { complete(tableName, req, null, new TableNotFoundException( "Table '" + tableName + "' was not found, got: '" + info.getTable() + "'")); return; } if (info.isSplit()) { complete(tableName, req, null, new RegionOfflineException( "the only available region for the required row is a split parent," + " the daughters should be online soon: '" + info.getRegionNameAsString() + "'")); return; } if (info.isOffline()) { complete(tableName, req, null, new RegionOfflineException("the region is offline, could" + " be caused by a disable table call: '" + info.getRegionNameAsString() + "'")); return; } if (loc.getServerName() == null) { complete(tableName, req, null, new NoServerForRegionException( String.format("No server address listed for region '%s', row='%s', locateType=%s", info.getRegionNameAsString(), Bytes.toStringBinary(req.row), req.locateType))); return; } complete(tableName, req, loc, null); } private HRegionLocation locateRowInCache(TableCache tableCache, TableName tableName, byte[] row) { Map.Entry<byte[], HRegionLocation> entry = tableCache.cache.floorEntry(row); if (entry == null) { return null; } HRegionLocation loc = entry.getValue(); byte[] endKey = loc.getRegionInfo().getEndKey(); if (isEmptyStopRow(endKey) || Bytes.compareTo(row, endKey) < 0) { if (LOG.isTraceEnabled()) { LOG.trace("Found " + loc + " in cache for '" + tableName + "', row='" + Bytes.toStringBinary(row) + "', locateType=" + RegionLocateType.CURRENT); } return loc; } else { return null; } } private HRegionLocation locateRowBeforeInCache(TableCache tableCache, TableName tableName, byte[] row) { Map.Entry<byte[], HRegionLocation> entry = isEmptyStopRow(row) ? tableCache.cache.lastEntry() : tableCache.cache.lowerEntry(row); if (entry == null) { return null; } HRegionLocation loc = entry.getValue(); if (isEmptyStopRow(loc.getRegionInfo().getEndKey()) || Bytes.compareTo(loc.getRegionInfo().getEndKey(), row) >= 0) { if (LOG.isTraceEnabled()) { LOG.trace("Found " + loc + " in cache for '" + tableName + "', row='" + Bytes.toStringBinary(row) + "', locateType=" + RegionLocateType.BEFORE); } return loc; } else { return null; } } private void locateInMeta(TableName tableName, LocateRequest req) { if (LOG.isTraceEnabled()) { LOG.trace("Try locate '" + tableName + "', row='" + Bytes.toStringBinary(req.row) + "', locateType=" + req.locateType + " in meta"); } byte[] metaKey; if (req.locateType.equals(RegionLocateType.BEFORE)) { if (isEmptyStopRow(req.row)) { byte[] binaryTableName = tableName.getName(); metaKey = Arrays.copyOf(binaryTableName, binaryTableName.length + 1); } else { metaKey = createRegionName(tableName, req.row, ZEROES, false); } } else { metaKey = createRegionName(tableName, req.row, NINES, false); } conn.getRawTable(META_TABLE_NAME) .scanAll(new Scan().withStartRow(metaKey).setReversed(true).addFamily(CATALOG_FAMILY) .setOneRowLimit()) .whenComplete((results, error) -> onScanComplete(tableName, req, results, error)); } private HRegionLocation locateInCache(TableCache tableCache, TableName tableName, byte[] row, RegionLocateType locateType) { return locateType.equals(RegionLocateType.BEFORE) ? locateRowBeforeInCache(tableCache, tableName, row) : locateRowInCache(tableCache, tableName, row); } // locateToPrevious is true means we will use the start key of a region to locate the region // placed before it. Used for reverse scan. See the comment of // AsyncRegionLocator.getPreviousRegionLocation. private CompletableFuture<HRegionLocation> getRegionLocationInternal(TableName tableName, byte[] row, RegionLocateType locateType) { // AFTER should be convert to CURRENT before calling this method assert !locateType.equals(RegionLocateType.AFTER); TableCache tableCache = getTableCache(tableName); HRegionLocation loc = locateInCache(tableCache, tableName, row, locateType); if (loc != null) { return CompletableFuture.completedFuture(loc); } CompletableFuture<HRegionLocation> future; LocateRequest req; boolean sendRequest = false; synchronized (tableCache) { // check again loc = locateInCache(tableCache, tableName, row, locateType); if (loc != null) { return CompletableFuture.completedFuture(loc); } req = new LocateRequest(row, locateType); future = tableCache.allRequests.get(req); if (future == null) { future = new CompletableFuture<>(); tableCache.allRequests.put(req, future); if (tableCache.hasQuota(maxConcurrentLocateRequestPerTable) && !tableCache.isPending(req)) { tableCache.send(req); sendRequest = true; } } } if (sendRequest) { locateInMeta(tableName, req); } return future; } CompletableFuture<HRegionLocation> getRegionLocation(TableName tableName, byte[] row, RegionLocateType locateType) { if (locateType.equals(RegionLocateType.BEFORE)) { return getRegionLocationInternal(tableName, row, locateType); } else { // as we know the exact row after us, so we can just create the new row, and use the same // algorithm to locate it. if (locateType.equals(RegionLocateType.AFTER)) { row = createClosestRowAfter(row); } return getRegionLocationInternal(tableName, row, RegionLocateType.CURRENT); } } void updateCachedLocation(HRegionLocation loc, Throwable exception) { AsyncRegionLocator.updateCachedLocation(loc, exception, l -> { TableCache tableCache = cache.get(l.getRegionInfo().getTable()); if (tableCache == null) { return null; } return tableCache.cache.get(l.getRegionInfo().getStartKey()); }, this::addToCache, this::removeFromCache); } void clearCache(TableName tableName) { TableCache tableCache = cache.remove(tableName); if (tableCache == null) { return; } synchronized (tableCache) { if (!tableCache.allRequests.isEmpty()) { IOException error = new IOException("Cache cleared"); tableCache.allRequests.values().forEach(f -> f.completeExceptionally(error)); } } } }