/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.blockmanagement;
import io.hops.exception.StorageException;
import io.hops.exception.TransactionContextException;
import io.hops.metadata.HdfsStorageFactory;
import io.hops.metadata.HdfsVariables;
import io.hops.metadata.common.entity.Variable;
import io.hops.metadata.hdfs.dal.BlockInfoDataAccess;
import io.hops.metadata.hdfs.dal.UnderReplicatedBlockDataAccess;
import io.hops.metadata.hdfs.entity.UnderReplicatedBlock;
import io.hops.transaction.EntityManager;
import io.hops.transaction.handler.HDFSOperationType;
import io.hops.transaction.handler.HopsTransactionalRequestHandler;
import io.hops.transaction.handler.LightWeightRequestHandler;
import io.hops.transaction.lock.LockFactory;
import io.hops.transaction.lock.TransactionLockTypes;
import io.hops.transaction.lock.TransactionLocks;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
/**
* Keep prioritized queues of under replicated blocks.
* Blocks have replication priority, with priority
* {@link #QUEUE_HIGHEST_PRIORITY} indicating the highest priority.
* <p>
* Having a prioritised queue allows the {@link BlockManager} to select
* which blocks to replicate first - it tries to give priority to data
* that is most at risk or considered most valuable.
* </p>
* <p>
* The policy for choosing which priority to give added blocks
* is implemented in {@link #getPriority(Block, int, int, int)}.
* </p>
* <p>The queue order is as follows:</p>
* <ol>
* <li>{@link #QUEUE_HIGHEST_PRIORITY}: the blocks that must be replicated
* first. That is blocks with only one copy, or blocks with zero live
* copies but a copy in a node being decommissioned. These blocks
* are at risk of loss if the disk or server on which they
* remain fails.</li>
* <li>{@link #QUEUE_VERY_UNDER_REPLICATED}: blocks that are very
* under-replicated compared to their expected values. Currently
* that means the ratio of actual:expected replicas is
* <i>less than</i> 1:3. These blocks may not be at risk,
* but they are clearly considered "important".</li>
* <li>{@link #QUEUE_UNDER_REPLICATED}: blocks that are also under
* replicated, and the ratio of actual:expected is good enough that
* they do not need to go into the {@link #QUEUE_VERY_UNDER_REPLICATED}
* queue.</li>
* <li>{@link #QUEUE_REPLICAS_BADLY_DISTRIBUTED}: there are at least as
* many copies of a block as required, but the blocks are not adequately
* distributed. Loss of a rack/switch could take all copies off-line.</li>
* <li>{@link #QUEUE_WITH_CORRUPT_BLOCKS}: This is for blocks that are corrupt
* and for which there are no non-corrupt copies (currently) available.
* The policy here is to keep those corrupt blocks replicated, but give
* blocks that are not corrupt higher priority.</li>
* </ol>
*/
class UnderReplicatedBlocks implements Iterable<Block> {
/**
 * The total number of queues : {@value}
 * <p>
 * Valid priority levels therefore run from {@code 0} to {@code LEVEL - 1};
 * the constants below name each of them.
 */
static final int LEVEL = 5;
/**
 * The queue with the highest priority: {@value}
 */
static final int QUEUE_HIGHEST_PRIORITY = 0;
/**
 * The queue for blocks that are way below their expected value : {@value}
 */
static final int QUEUE_VERY_UNDER_REPLICATED = 1;
/**
 * The queue for "normally" under-replicated blocks: {@value}
 */
static final int QUEUE_UNDER_REPLICATED = 2;
/**
 * The queue for blocks that have the right number of replicas,
 * but which the block manager felt were badly distributed: {@value}
 */
static final int QUEUE_REPLICAS_BADLY_DISTRIBUTED = 3;
/**
 * The queue for corrupt blocks: {@value}
 * <p>
 * This is the last (lowest-priority) level, i.e. {@code LEVEL - 1}.
 */
static final int QUEUE_WITH_CORRUPT_BLOCKS = 4;
/**
 * Creates the queue facade. The instance keeps no fields of its own, so
 * there is nothing to initialise here.
 */
UnderReplicatedBlocks() {
  // intentionally empty
}
/**
 * Empties all queues by asking the under-replicated-block data access
 * object to remove every entry.
 *
 * @throws IOException
 *     propagated from the request handler
 */
void clear() throws IOException {
  LightWeightRequestHandler deleteAllHandler = new LightWeightRequestHandler(
      HDFSOperationType.DEL_ALL_UNDER_REPLICATED_BLKS) {
    @Override
    public Object performTask() throws StorageException, IOException {
      Object dataAccess = HdfsStorageFactory
          .getDataAccess(UnderReplicatedBlockDataAccess.class);
      ((UnderReplicatedBlockDataAccess) dataAccess).removeAll();
      return null;
    }
  };
  deleteAllHandler.handle();
}
/**
 * Counts the entries of all queues together, as reported by the data
 * access object's {@code countAll()}.
 *
 * @return the total number of under replicated block entries
 * @throws IOException
 *     propagated from the request handler
 */
int size() throws IOException {
  LightWeightRequestHandler countAllHandler = new LightWeightRequestHandler(
      HDFSOperationType.COUNT_ALL_UNDER_REPLICATED_BLKS) {
    @Override
    public Object performTask() throws StorageException, IOException {
      UnderReplicatedBlockDataAccess dataAccess =
          (UnderReplicatedBlockDataAccess) HdfsStorageFactory
              .getDataAccess(UnderReplicatedBlockDataAccess.class);
      return dataAccess.countAll();
    }
  };
  return (Integer) countAllHandler.handle();
}
/**
 * Counts the entries whose level is below
 * {@link #QUEUE_WITH_CORRUPT_BLOCKS}, i.e. the under replicated blocks
 * excluding the corrupt ones.
 *
 * @return the number of under replicated blocks that are not in the corrupt
 * queue
 * @throws IOException
 *     propagated from the request handler
 */
int getUnderReplicatedBlockCount() throws IOException {
  LightWeightRequestHandler countHandler = new LightWeightRequestHandler(
      HDFSOperationType.COUNT_UNDER_REPLICATED_BLKS_LESS_THAN_LVL4) {
    @Override
    public Object performTask() throws StorageException, IOException {
      UnderReplicatedBlockDataAccess dataAccess =
          (UnderReplicatedBlockDataAccess) HdfsStorageFactory
              .getDataAccess(UnderReplicatedBlockDataAccess.class);
      return dataAccess.countLessThanALevel(QUEUE_WITH_CORRUPT_BLOCKS);
    }
  };
  return (Integer) countHandler.handle();
}
/**
 * Counts the entries in the {@link #QUEUE_WITH_CORRUPT_BLOCKS} queue.
 *
 * @return the number of corrupt blocks
 * @throws IOException
 *     propagated from {@code count(int)}
 */
int getCorruptBlockSize() throws IOException {
  final int corruptBlocks = count(QUEUE_WITH_CORRUPT_BLOCKS);
  return corruptBlocks;
}
/**
 * Tells whether a block currently has an entry in the neededReplication
 * queues.
 *
 * @param block
 *     the block to look up
 * @return true if an under replicated entry exists for the block
 */
boolean contains(BlockInfo block)
    throws StorageException, TransactionContextException {
  UnderReplicatedBlock existing = getUnderReplicatedBlock(block);
  return existing != null;
}
/**
 * Computes the priority queue a block belongs to from its replica counts.
 *
 * @param block
 *     an under replicated block (not inspected by the computation)
 * @param curReplicas
 *     current number of replicas of the block
 * @param decommissionedReplicas
 *     the number of decommissioned replicas
 * @param expectedReplicas
 *     expected number of replicas of the block
 * @return the priority for the blocks, between 0 and ({@link #LEVEL}-1)
 */
private int getPriority(Block block, int curReplicas,
    int decommissionedReplicas, int expectedReplicas) {
  assert curReplicas >= 0 : "Negative replicas!";
  if (curReplicas >= expectedReplicas) {
    // Enough copies exist, so the problem must be their rack placement.
    return QUEUE_REPLICAS_BADLY_DISTRIBUTED;
  }
  if (curReplicas == 0) {
    // No live replica: a decommissioned copy can still be used as a source
    // and is urgent; otherwise only corrupt copies are left.
    return decommissionedReplicas > 0
        ? QUEUE_HIGHEST_PRIORITY
        : QUEUE_WITH_CORRUPT_BLOCKS;
  }
  if (curReplicas == 1) {
    // A single replica is one failure away from data loss.
    return QUEUE_HIGHEST_PRIORITY;
  }
  if (curReplicas * 3 < expectedReplicas) {
    // Fewer than a third of the requested replicas exist.
    return QUEUE_VERY_UNDER_REPLICATED;
  }
  // Ordinary under-replication.
  return QUEUE_UNDER_REPLICATED;
}
/**
 * Adds a block to the under replication queue that matches the priority
 * computed from its replica counts.
 *
 * @param block
 *     an under replicated block
 * @param curReplicas
 *     current number of replicas of the block
 * @param decomissionedReplicas
 *     the number of decommissioned replicas
 * @param expectedReplicas
 *     expected number of replicas of the block
 * @return true if the block was added to a queue, false if the computed
 * priority equals {@link #LEVEL} or the block already had an entry
 */
boolean add(BlockInfo block, int curReplicas, int decomissionedReplicas,
    int expectedReplicas)
    throws StorageException, TransactionContextException {
  assert curReplicas >= 0 : "Negative replicas!";
  final int priLevel = getPriority(block, curReplicas, decomissionedReplicas,
      expectedReplicas);
  if (priLevel == LEVEL || !add(block, priLevel)) {
    return false;
  }
  if (NameNode.blockStateChangeLog.isDebugEnabled()) {
    NameNode.blockStateChangeLog.debug(
        "BLOCK* NameSystem.UnderReplicationBlock.add:" + block +
            " has only " + curReplicas + " replicas and need " +
            expectedReplicas +
            " replicas so is added to neededReplications" +
            " at priority level " + priLevel);
  }
  return true;
}
/**
 * Removes a block from the under replication queues, deriving the priority
 * hint from the replica counts the block had when it was queued.
 *
 * @param block
 *     block to remove
 * @param oldReplicas
 *     the number of replicas the block had
 * @param decommissionedReplicas
 *     the number of decommissioned replicas
 * @param oldExpectedReplicas
 *     the number of replicas that was expected
 * @return the result of {@code remove(block, priority)} for the derived
 * priority
 */
boolean remove(BlockInfo block, int oldReplicas, int decommissionedReplicas,
    int oldExpectedReplicas)
    throws StorageException, TransactionContextException {
  final int formerPriority = getPriority(block, oldReplicas,
      decommissionedReplicas, oldExpectedReplicas);
  return remove(block, formerPriority);
}
/**
 * Remove a block from the under replication queues.
 * <p/>
 * The priLevel parameter is only a hint of which queue the caller expects
 * the block to be in. The entry is looked up by block id and inode id,
 * independently of its level, so the block is removed from whichever queue
 * it is in. This also holds when the hint is negative or
 * >= {@link #LEVEL}, or when it names a different queue than the one the
 * block is stored in.
 * <p/>
 * <i>Warning:</i> This is not a synchronized method.
 *
 * @param block
 *     block to remove
 * @param priLevel
 *     expected priority level (hint only, it does not restrict the removal)
 * @return true if the block was found and removed from one of the priority
 * queues
 */
boolean remove(BlockInfo block, int priLevel)
    throws StorageException, TransactionContextException {
  UnderReplicatedBlock urb = getUnderReplicatedBlock(block);
  // An out-of-range hint used to turn this call into a silent no-op, which
  // left the entry behind although the contract promises a removal from
  // "all queues". The lookup above already covers every level.
  if (!remove(urb)) {
    return false;
  }
  if (NameNode.blockStateChangeLog.isDebugEnabled()) {
    NameNode.blockStateChangeLog.debug(
        "BLOCK* NameSystem.UnderReplicationBlock.remove: " +
            "Removing block " + block + " from priority queue " +
            urb.getLevel());
  }
  return true;
}
/**
 * Recalculates the priority level of a block and moves its entry if needed.
 * <p/>
 * The previous replica counts are reconstructed from the current counts and
 * the deltas. If the previous priority differs from the current one, the
 * block is removed first. An add at the current priority is then attempted
 * in every case, so that the block ends up in the queue of its current
 * priority.
 *
 * @param block
 *     an under replicated block
 * @param curReplicas
 *     current number of replicas of the block
 * @param decommissionedReplicas
 *     the number of decommissioned replicas
 * @param curExpectedReplicas
 *     expected number of replicas of the block
 * @param curReplicasDelta
 *     the change in the replica count from before
 * @param expectedReplicasDelta
 *     the change in the expected replica count from before
 */
void update(BlockInfo block, int curReplicas, int decommissionedReplicas,
    int curExpectedReplicas, int curReplicasDelta, int expectedReplicasDelta)
    throws StorageException, TransactionContextException {
  final int oldReplicas = curReplicas - curReplicasDelta;
  final int oldExpectedReplicas = curExpectedReplicas - expectedReplicasDelta;
  final int curPri = getPriority(block, curReplicas, decommissionedReplicas,
      curExpectedReplicas);
  final int oldPri = getPriority(block, oldReplicas, decommissionedReplicas,
      oldExpectedReplicas);
  if (NameNode.stateChangeLog.isDebugEnabled()) {
    NameNode.stateChangeLog.debug("UnderReplicationBlocks.update " +
        block +
        " curReplicas " + curReplicas +
        " curExpectedReplicas " + curExpectedReplicas +
        " oldReplicas " + oldReplicas +
        " oldExpectedReplicas " + oldExpectedReplicas +
        " curPri " + curPri +
        " oldPri " + oldPri);
  }

  // Drop the entry first when the block changed priority.
  final boolean priorityChanged = oldPri != curPri;
  if (priorityChanged && oldPri != LEVEL) {
    remove(block, oldPri);
  }

  // Then (re-)insert it at the current priority; nothing to report when the
  // block was not newly added.
  if (curPri == LEVEL) {
    return;
  }
  if (!add(block, curPri)) {
    return;
  }
  if (NameNode.blockStateChangeLog.isDebugEnabled()) {
    NameNode.blockStateChangeLog.debug(
        "BLOCK* NameSystem.UnderReplicationBlock.update:" + block +
            " has only " + curReplicas + " replicas and needs " +
            curExpectedReplicas +
            " replicas so is added to neededReplications" +
            " at priority level " + curPri);
  }
}
/**
 * Get a list of block lists to be replicated. The index of each block list
 * is its replication priority. A replication index (an offset into the
 * priority's queue) is tracked per priority through
 * {@code getReplicationIndex()} / {@code setReplicationIndex(List)}; each
 * call continues from that offset and stores the advanced offsets again.
 * <p/>
 * Once the end of the last priority queue has been reached, all replication
 * indexes are reset to 0 so that the next call starts again from the
 * beginning of every queue.
 *
 * @param blocksToProcess
 *     - number of blocks to fetch from underReplicated blocks.
 * @return Return a list of block lists to be replicated. The block list index
 * represents its replication priority.
 * @throws IOException
 *     propagated from the storage request handlers
 */
private List<List<Block>> chooseUnderReplicatedBlocksInt(int blocksToProcess)
    throws IOException {
  // initialize data structure for the return value
  List<List<Block>> blocksToReplicate = new ArrayList<List<Block>>(LEVEL);
  for (int i = 0; i < LEVEL; i++) {
    blocksToReplicate.add(new ArrayList<Block>());
  }

  // One storage round trip serves both the emptiness check and the cap.
  final int totalBlocks = size();
  if (totalBlocks == 0) { // There are no blocks to collect.
    return blocksToReplicate;
  }

  List<Integer> priorityToReplIdx = getReplicationIndex();
  List<List<Block>> priorityQueuestmp = createPrioriryQueue();

  int blockCount = 0;
  blocksToProcess = Math.min(blocksToProcess, totalBlocks);
  for (int priority = 0; priority < LEVEL; priority++) {
    if (blockCount == blocksToProcess) {
      break; // break if already expected blocks are obtained
    }
    // Go through all blocks that need replications with current priority.
    final int replIndex = priorityToReplIdx.get(priority);
    final int remainingblksToProcess = blocksToProcess - blockCount;
    List<UnderReplicatedBlock> urbs =
        getUnderReplicatedBlocks(priority, replIndex, remainingblksToProcess);
    final int fetched = urbs.size();
    addBlocksInPriorityQueues(urbs, priorityQueuestmp);
    List<Block> blks = priorityQueuestmp.get(priority);
    blocksToReplicate.get(priority).addAll(blks);
    blockCount += blks.size();

    // The end of the last queue is reached when fewer entries than requested
    // came back (assuming the limited lookup returns up to the requested
    // number of rows), or when offset + fetched covers the whole queue.
    // The previous test (count <= remaining) missed the case of a last
    // queue larger than the batch size, so the indexes were never reset and
    // entries behind them were never selected again.
    // The priority test comes first so count() is only issued for the last
    // level.
    if (priority == LEVEL - 1 && (fetched < remainingblksToProcess
        || count(priority) <= replIndex + fetched)) {
      // reset all priorities replication index to 0 because there is no
      // recently added blocks in any list.
      for (int i = 0; i < LEVEL; i++) {
        priorityToReplIdx.set(i, 0);
      }
      break;
    }
    priorityToReplIdx.set(priority, replIndex + blks.size());
  }
  setReplicationIndex(priorityToReplIdx);
  return blocksToReplicate;
}
/**
 * Builds an iterator over the blocks of a single priority queue. The queue
 * content is read inside a transactional request that acquires no locks.
 *
 * @param level
 *     the priority level to iterate over
 * @return the iterator, or {@code null} if reading the queue failed with an
 * {@link IOException} (the error is logged)
 */
BlockIterator iterator(final int level) {
  try {
    HopsTransactionalRequestHandler snapshotHandler =
        new HopsTransactionalRequestHandler(
            HDFSOperationType.UNDER_REPLICATED_BLKS_ITERATOR) {
          @Override
          public void acquireLock(TransactionLocks locks) throws IOException {
            // no locks are taken for this read
          }

          @Override
          public Object performTask() throws StorageException, IOException {
            List<List<Block>> queues = fillPriorityQueues(level);
            return new BlockIterator(queues, level);
          }
        };
    return (BlockIterator) snapshotHandler.handle();
  } catch (IOException ex) {
    BlockManager.LOG
        .error("Error while filling the priorityQueues from db", ex);
    return null;
  }
}
/**
 * Builds an iterator over the blocks of all priority queues. The queue
 * content is read inside a transactional request that acquires no locks.
 *
 * @return the iterator, or {@code null} if reading the queues failed with an
 * {@link IOException} (the error is logged)
 */
@Override
public BlockIterator iterator() {
  try {
    HopsTransactionalRequestHandler snapshotHandler =
        new HopsTransactionalRequestHandler(
            HDFSOperationType.UNDER_REPLICATED_BLKS_ITERATOR) {
          @Override
          public void acquireLock(TransactionLocks locks) throws IOException {
            // no locks are taken for this read
          }

          @Override
          public Object performTask() throws StorageException, IOException {
            List<List<Block>> queues = fillPriorityQueues();
            return new BlockIterator(queues);
          }
        };
    return (BlockIterator) snapshotHandler.handle();
  } catch (IOException ex) {
    BlockManager.LOG
        .error("Error while filling the priorityQueues from db", ex);
    return null;
  }
}
/**
 * An iterator over blocks, either across all priority queues or restricted
 * to a single one. It walks the lists handed to its constructor;
 * {@link #remove()} delegates to the iterator of such a list and makes no
 * storage call itself.
 */
class BlockIterator implements Iterator<Block> {
  /** Current priority (all-queues mode) or the fixed priority (single mode). */
  private int level;
  /** True when the iterator was built for one priority level only. */
  private final boolean isIteratorForLevel;
  /** One iterator per queue, or exactly one in single-level mode. */
  private final List<Iterator<Block>> iterators =
      new ArrayList<Iterator<Block>>();

  /**
   * Construct an iterator over all queues.
   *
   * @param priorityQueuestmp
   *     the queues, indexed by priority
   */
  private BlockIterator(List<List<Block>> priorityQueuestmp) {
    isIteratorForLevel = false;
    level = 0;
    synchronized (iterators) {
      for (int priority = 0; priority < LEVEL; priority++) {
        List<Block> queue = priorityQueuestmp.get(priority);
        iterators.add(queue.iterator());
      }
    }
  }

  /**
   * Construct an iterator for a single queue level.
   *
   * @param priorityQueuestmp
   *     the queues, indexed by priority
   * @param l
   *     the priority level to iterate over
   */
  private BlockIterator(List<List<Block>> priorityQueuestmp, int l) {
    isIteratorForLevel = true;
    level = l;
    synchronized (iterators) {
      List<Block> queue = priorityQueuestmp.get(level);
      iterators.add(queue.iterator());
    }
  }

  /**
   * Index into {@link #iterators} of the iterator in use: always 0 in
   * single-level mode, otherwise the current level.
   */
  private int activeIndex() {
    return isIteratorForLevel ? 0 : level;
  }

  /**
   * In all-queues mode, advances {@link #level} past exhausted queues,
   * stopping at the last one. Does nothing in single-level mode.
   */
  private void update() {
    if (isIteratorForLevel) {
      return;
    }
    synchronized (iterators) {
      while (level < LEVEL - 1 && !iterators.get(level).hasNext()) {
        level++;
      }
    }
  }

  @Override
  public Block next() {
    update();
    synchronized (iterators) {
      return iterators.get(activeIndex()).next();
    }
  }

  @Override
  public boolean hasNext() {
    update();
    synchronized (iterators) {
      return iterators.get(activeIndex()).hasNext();
    }
  }

  @Override
  public void remove() {
    synchronized (iterators) {
      iterators.get(activeIndex()).remove();
    }
  }

  /**
   * @return the priority level the iterator is currently positioned on
   */
  int getPriority() {
    return level;
  }
}
/**
 * Decrements the stored replication index of the given priority by one,
 * never going below zero, and writes the index list back.
 *
 * @param priority
 *     - int priority level
 */
public void decrementReplicationIndex(int priority)
    throws StorageException, TransactionContextException {
  List<Integer> replicationIndexes = getReplicationIndex();
  final int current = replicationIndexes.get(priority);
  final int decremented = current > 0 ? current - 1 : 0;
  replicationIndexes.set(priority, decremented);
  setReplicationIndex(replicationIndexes);
}
/**
 * Transactional entry point for choosing the blocks to replicate. It takes a
 * write lock on the {@code ReplicationIndex} variable and then delegates to
 * {@code chooseUnderReplicatedBlocksInt(int)}.
 *
 * @param blocksToProcess
 *     number of blocks to fetch from the under replicated blocks
 * @return a list of block lists whose index is the replication priority
 * @throws IOException
 *     propagated from the request handler
 */
public List<List<Block>> chooseUnderReplicatedBlocks(
    final int blocksToProcess) throws IOException {
  HopsTransactionalRequestHandler chooseHandler =
      new HopsTransactionalRequestHandler(
          HDFSOperationType.CHOOSE_UNDER_REPLICATED_BLKS) {
        @Override
        public void acquireLock(TransactionLocks locks) throws IOException {
          // The replication indexes are read and rewritten by the task.
          LockFactory factory = LockFactory.getInstance();
          locks.add(factory.getVariableLock(Variable.Finder.ReplicationIndex,
              TransactionLockTypes.LockType.WRITE));
        }

        @Override
        public Object performTask() throws StorageException, IOException {
          return chooseUnderReplicatedBlocksInt(blocksToProcess);
        }
      };
  return (List<List<Block>>) chooseHandler.handle();
}
/**
 * Removes the given entry if there is one.
 *
 * @param urb
 *     the entry to remove, may be {@code null}
 * @return true if an entry was passed and removed, false for {@code null}
 */
private boolean remove(UnderReplicatedBlock urb)
    throws StorageException, TransactionContextException {
  if (urb == null) {
    return false;
  }
  removeUnderReplicatedBlock(urb);
  return true;
}
/**
 * Creates an entry for the block at the given priority unless the block
 * already has one.
 *
 * @param block
 *     the block to queue
 * @param priLevel
 *     the priority level of the new entry
 * @return true if no entry existed and one was added, otherwise false
 */
private boolean add(BlockInfo block, int priLevel)
    throws StorageException, TransactionContextException {
  UnderReplicatedBlock existing = getUnderReplicatedBlock(block);
  if (existing != null) {
    return false;
  }
  UnderReplicatedBlock created = new UnderReplicatedBlock(priLevel,
      block.getBlockId(), block.getInodeId());
  addUnderReplicatedBlock(created);
  return true;
}
/**
 * Loads the blocks of all priority levels; {@code -1} is the level value
 * that selects every level.
 *
 * @return the queues, indexed by priority
 */
private List<List<Block>> fillPriorityQueues() throws IOException {
  final int allLevels = -1;
  return fillPriorityQueues(allLevels);
}
/**
 * Loads the under replicated entries of one level (or of all levels for
 * {@code -1}) and sorts the corresponding blocks into fresh queues.
 *
 * @param level
 *     the priority level to load, or {@code -1} for all levels
 * @return the queues, indexed by priority; always {@link #LEVEL} lists
 */
private List<List<Block>> fillPriorityQueues(int level) throws IOException {
  List<List<Block>> queues = createPrioriryQueue();
  List<UnderReplicatedBlock> entries = getUnderReplicatedBlocks(level);
  if (entries.isEmpty()) {
    return queues;
  }
  addBlocksInPriorityQueues(entries, queues);
  return queues;
}
/**
 * Creates {@link #LEVEL} empty block lists, one per priority.
 *
 * @return a new list of empty queues, indexed by priority
 */
private List<List<Block>> createPrioriryQueue() {
  List<List<Block>> queues = new ArrayList<List<Block>>();
  while (queues.size() < LEVEL) {
    queues.add(new ArrayList<Block>());
  }
  return queues;
}
/**
 * Reads under replicated entries from storage.
 *
 * @param level
 *     the priority level to read, or {@code -1} to read every entry
 * @return the entries found
 */
private List<UnderReplicatedBlock> getUnderReplicatedBlocks(final int level)
    throws IOException {
  LightWeightRequestHandler findHandler = new LightWeightRequestHandler(
      HDFSOperationType.GET_ALL_UNDER_REPLICATED_BLKS) {
    @Override
    public Object performTask() throws StorageException, IOException {
      UnderReplicatedBlockDataAccess dataAccess =
          (UnderReplicatedBlockDataAccess) HdfsStorageFactory
              .getDataAccess(UnderReplicatedBlockDataAccess.class);
      if (level != -1) {
        return dataAccess.findByLevel(level);
      }
      return dataAccess.findAll();
    }
  };
  return (List<UnderReplicatedBlock>) findHandler.handle();
}
/**
 * Reads a window of the under replicated entries of one priority level.
 *
 * @param level
 *     the priority level to read
 * @param offset
 *     passed to the data access object as the offset of the window
 * @param count
 *     passed to the data access object as the size of the window
 * @return the entries found
 */
private List<UnderReplicatedBlock> getUnderReplicatedBlocks(final int level,
    final int offset, final int count) throws IOException {
  LightWeightRequestHandler findHandler = new LightWeightRequestHandler(
      HDFSOperationType.GET_UNDER_REPLICATED_BLKS_By_LEVEL_LIMITED) {
    @Override
    public Object performTask() throws StorageException, IOException {
      UnderReplicatedBlockDataAccess dataAccess =
          (UnderReplicatedBlockDataAccess) HdfsStorageFactory
              .getDataAccess(UnderReplicatedBlockDataAccess.class);
      return dataAccess.findByLevel(level, offset, count);
    }
  };
  return (List<UnderReplicatedBlock>) findHandler.handle();
}
/**
 * Resolves under replicated entries to their blocks and appends each block
 * to the queue of its entry's level. Entries for which no block is returned
 * are handed to the data access object for removal.
 * <p/>
 * An empty entry list returns immediately without contacting storage.
 *
 * @param allUrb
 *     the under replicated entries to resolve
 * @param priorityQueuestmp
 *     the queues, indexed by priority, that receive the blocks
 * @throws IOException
 *     propagated from the request handler
 */
private void addBlocksInPriorityQueues(
    final List<UnderReplicatedBlock> allUrb,
    final List<List<Block>> priorityQueuestmp) throws IOException {
  if (allUrb.isEmpty()) {
    // Callers invoke this once per priority level, including empty ones;
    // there is nothing to look up or clean up, so skip the round trip.
    return;
  }

  final int urbCount = allUrb.size();
  final long[] blockIds = new long[urbCount];
  final int[] inodeIds = new int[urbCount];
  final HashMap<Long, UnderReplicatedBlock> allUrbHashMap =
      new HashMap<Long, UnderReplicatedBlock>();
  int idx = 0;
  for (UnderReplicatedBlock b : allUrb) {
    blockIds[idx] = b.getBlockId();
    inodeIds[idx] = b.getInodeId();
    allUrbHashMap.put(b.getBlockId(), b);
    idx++;
  }

  // use lightweight transaction handler here and it should work
  new LightWeightRequestHandler(HDFSOperationType.GET_BLOCKS) {
    @Override
    public Object performTask() throws StorageException, IOException {
      BlockInfoDataAccess bda = (BlockInfoDataAccess) HdfsStorageFactory
          .getDataAccess(BlockInfoDataAccess.class);
      List<BlockInfo> blks = bda.findByIds(blockIds, inodeIds);
      for (BlockInfo blk : blks) {
        // Matched entries leave the map; what remains has no block.
        UnderReplicatedBlock urb = allUrbHashMap.remove(blk.getBlockId());
        assert urb.getInodeId() == blk.getInodeId();
        priorityQueuestmp.get(urb.getLevel()).add(blk);
      }
      //HOP[M]: the map now only holds the under replicated entries that
      // do not have any block attached, so it's safe to delete these
      // entries without taking any locks
      Collection<UnderReplicatedBlock> toRemove = allUrbHashMap.values();
      if (!toRemove.isEmpty()) {
        UnderReplicatedBlockDataAccess uda =
            (UnderReplicatedBlockDataAccess) HdfsStorageFactory
                .getDataAccess(UnderReplicatedBlockDataAccess.class);
        uda.prepare(toRemove, Collections.EMPTY_LIST, Collections.EMPTY_LIST);
      }
      return null;
    }
  }.handle();
}
/**
 * Counts the entries of one priority level.
 *
 * @param level
 *     the priority level to count
 * @return the number of entries at that level
 * @throws IOException
 *     propagated from the request handler
 */
int count(final int level) throws IOException {
  LightWeightRequestHandler countHandler = new LightWeightRequestHandler(
      HDFSOperationType.COUNT_UNDER_REPLICATED_BLKS_AT_LVL) {
    @Override
    public Object performTask() throws StorageException, IOException {
      UnderReplicatedBlockDataAccess dataAccess =
          (UnderReplicatedBlockDataAccess) HdfsStorageFactory
              .getDataAccess(UnderReplicatedBlockDataAccess.class);
      return dataAccess.countByLevel(level);
    }
  };
  return (Integer) countHandler.handle();
}
/**
 * Looks up the under replicated entry of a block by block id and inode id.
 *
 * @param blk
 *     the block to look up
 * @return whatever the entity manager finds for these ids; callers in this
 * class treat {@code null} as "no entry"
 */
private UnderReplicatedBlock getUnderReplicatedBlock(BlockInfo blk)
    throws StorageException, TransactionContextException {
  UnderReplicatedBlock entry = EntityManager
      .find(UnderReplicatedBlock.Finder.ByBlockIdAndINodeId, blk.getBlockId(),
          blk.getInodeId());
  return entry;
}
/**
 * Hands a new under replicated entry to the entity manager.
 *
 * @param urb
 *     the entry to add
 */
private void addUnderReplicatedBlock(final UnderReplicatedBlock urb)
    throws StorageException, TransactionContextException {
  EntityManager.add(urb);
}
/**
 * Asks the entity manager to remove an under replicated entry.
 *
 * @param urb
 *     the entry to remove
 */
private void removeUnderReplicatedBlock(final UnderReplicatedBlock urb)
    throws StorageException, TransactionContextException {
  EntityManager.remove(urb);
}
/**
 * Reads the replication indexes from {@code HdfsVariables}.
 *
 * @return the stored replication indexes; callers in this class index the
 * list by priority level
 */
private List<Integer> getReplicationIndex()
    throws StorageException, TransactionContextException {
  final List<Integer> storedIndexes = HdfsVariables.getReplicationIndex();
  return storedIndexes;
}
/**
 * Writes the replication indexes back through {@code HdfsVariables}.
 *
 * @param replicationIndex
 *     the indexes to store
 */
private void setReplicationIndex(final List<Integer> replicationIndex)
    throws StorageException, TransactionContextException {
  HdfsVariables.setReplicationIndex(replicationIndex);
}
}