package io.blobkeeper.cluster.service;
/*
* Copyright (C) 2015-2016 by Denis M. Gabaydulin
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.google.common.collect.ImmutableMap;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import io.blobkeeper.cluster.configuration.ClusterPropertiesConfiguration;
import io.blobkeeper.cluster.domain.CustomMessageHeader;
import io.blobkeeper.cluster.domain.DifferenceInfo;
import io.blobkeeper.cluster.domain.MerkleTreeInfo;
import io.blobkeeper.cluster.domain.Node;
import io.blobkeeper.common.logging.MdcContext;
import io.blobkeeper.common.util.LeafNode;
import io.blobkeeper.common.util.MdcUtils;
import io.blobkeeper.common.util.MerkleTree;
import io.blobkeeper.file.domain.File;
import io.blobkeeper.file.domain.ReplicationFile;
import io.blobkeeper.file.service.DiskService;
import io.blobkeeper.file.service.FileListService;
import io.blobkeeper.index.domain.CacheKey;
import io.blobkeeper.index.domain.Partition;
import io.blobkeeper.index.service.IndexCacheService;
import io.blobkeeper.index.service.IndexService;
import io.blobkeeper.index.util.IndexUtils;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.jgroups.*;
import org.jgroups.blocks.MethodCall;
import org.jgroups.blocks.RequestOptions;
import org.jgroups.blocks.ResponseMode;
import org.jgroups.blocks.RpcDispatcher;
import org.jgroups.blocks.locking.LockService;
import org.jgroups.conf.ClassConfigurator;
import org.jgroups.util.Util;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.inject.Inject;
import javax.inject.Singleton;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.lang.reflect.Method;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Random;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Lock;
import static com.google.common.collect.Iterables.toArray;
import static com.jayway.awaitility.Awaitility.await;
import static com.jayway.awaitility.Duration.FIVE_HUNDRED_MILLISECONDS;
import static io.blobkeeper.cluster.domain.CustomMessageHeader.CUSTOM_MESSAGE_HEADER;
import static io.blobkeeper.cluster.domain.Role.MASTER;
import static io.blobkeeper.cluster.domain.Role.SLAVE;
import static io.blobkeeper.common.logging.MdcContext.SRC_NODE;
import static io.blobkeeper.common.util.GuavaCollectors.toImmutableList;
import static io.blobkeeper.common.util.MdcUtils.setCurrentContext;
import static io.blobkeeper.common.util.MerkleTree.MAX_LEVEL;
import static io.blobkeeper.common.util.Utils.createEmptyTree;
import static io.blobkeeper.file.util.FileUtils.buildMerkleTree;
import static java.lang.System.currentTimeMillis;
import static java.util.Optional.ofNullable;
import static java.util.concurrent.CompletableFuture.allOf;
import static java.util.concurrent.CompletableFuture.runAsync;
import static org.jgroups.blocks.ResponseMode.GET_FIRST;
import static org.jgroups.jmx.JmxConfigurator.registerChannel;
import static org.jgroups.jmx.JmxConfigurator.unregisterChannel;
import static org.jgroups.util.Util.createConcurrentMap;
import static org.jgroups.util.Util.getMBeanServer;
@Singleton
public class ClusterMembershipServiceImpl extends ReceiverAdapter implements ClusterMembershipService {
private static final Logger log = LoggerFactory.getLogger(ClusterMembershipServiceImpl.class);
private static final Short SET_MASTER = 0x1;
private static final Short GET_MASTER = 0x2;
private static final Short REMOVE_MASTER = 0x4;
private static final Short GET_NODE = 0x5;
private static final Short GET_TREE_INFO = 0x6;
private static final Short GET_TREE_DIFF_NODE = 0x7;
private static final Short DELETE_PARTITION_FILE = 0x8;
private static final String CLUSTER_NAME = "blobkeeper_cluster";
private static final String MASTER_LOCK = "master_lock";
@Inject
private FileListService fileListService;
@Inject
private LockService lockService;
@Inject
private ReplicationHandlerService replicationHandlerService;
@Inject
private RepairService repairService;
@Inject
private MasterChangedListener listener;
@Inject
private ClusterPropertiesConfiguration configuration;
@Inject
private ReplicationClientService replicationClient;
@Inject
private IndexService indexService;
@Inject
private IndexUtils indexUtils;
@Inject
private DiskService diskService;
@Inject
private IndexCacheService indexCacheService;
private final Random random = new Random();
private JChannel channel;
private volatile Node self;
private volatile Node master;
private RpcDispatcher dispatcher = null;
private final ExecutorService masterSelectorAndRepairExecutor = Executors.newFixedThreadPool(
16,
new ThreadFactoryBuilder()
.setDaemon(true)
.setNameFormat("RepairWorker-%d")
.build()
);
private static final Map<Short, Method> methods = createConcurrentMap(16);
static {
try {
methods.put(SET_MASTER, ClusterMembershipServiceImpl.class.getMethod("_setMaster", Address.class));
methods.put(GET_MASTER, ClusterMembershipServiceImpl.class.getMethod("_getMaster"));
methods.put(REMOVE_MASTER, ClusterMembershipServiceImpl.class.getMethod("_removeMaster"));
methods.put(GET_NODE, ClusterMembershipServiceImpl.class.getMethod("_getNode"));
methods.put(GET_TREE_INFO, ClusterMembershipServiceImpl.class.getMethod("_getMerkleTreeInfo", int.class, int.class));
methods.put(GET_TREE_DIFF_NODE, ClusterMembershipServiceImpl.class.getMethod("_getDifference", int.class, int.class));
methods.put(DELETE_PARTITION_FILE, ClusterMembershipServiceImpl.class.getMethod("_deletePartitionFile", int.class, int.class));
} catch (NoSuchMethodException e) {
throw new RuntimeException(e);
}
ClassConfigurator.add(CUSTOM_MESSAGE_HEADER, CustomMessageHeader.class);
}
@Override
public void start(@NotNull String name) {
try {
channel = new JChannel(ClusterMembershipServiceImpl.class.getClassLoader().getResourceAsStream(configuration.getClusterConfig()));
channel.setDiscardOwnMessages(true);
channel.setName(name);
dispatcher = new RpcDispatcher(channel, null, this, this);
dispatcher.setMethodLookup(methods::get);
dispatcher.setMessageListener(this);
channel.connect(CLUSTER_NAME);
channel.getState(null, 10000);
setCurrentContext(new MdcContext(ImmutableMap.of(SRC_NODE, getSelfNode().toString())));
log.info("Node is started");
registerChannel(channel, getMBeanServer(), name);
} catch (Exception e) {
log.error("Can't create channel", e);
throw new RuntimeException(e);
}
}
@Override
public void stop() {
await().forever().pollInterval(FIVE_HUNDRED_MILLISECONDS).until(
() -> {
log.trace("Waiting for repair");
return !repairService.isRepairInProgress();
});
dispatcher.stop();
try {
unregisterChannel(getMBeanServer(), channel.getName());
} catch (Exception e) {
log.error("Can't unregister channel", e);
}
channel.close();
log.debug("Node is stopped");
self = null;
master = null;
}
@Override
public JChannel getChannel() {
return channel;
}
@Override
public Optional<Node> getMaster() {
return ofNullable(master);
}
@Override
public void setMaster(@NotNull Node node) {
_setMaster(node.getAddress());
}
@Override
public boolean trySetMaster(@NotNull Address newMaster) {
// TODO: change executor
List<CompletableFuture<Void>> setters = getNodes().stream()
.map(node -> runAsync(() -> setMaster(node.getAddress(), newMaster)))
.collect(toImmutableList());
allOf(toArray(setters, CompletableFuture.class))
.join();
return true;
}
@Override
public Node getSelfNode() {
if (self == null) {
setSelfNode(new Node(SLAVE, channel.getAddress(), currentTimeMillis()));
log.info("Self updated");
}
return self;
}
@Override
public List<Node> getNodes() {
Optional<Node> master = getMaster();
String masterAddress = master
.map(Node::getAddress)
.map(Address::toString)
.orElse(null);
// FIXME: find a better way for optimization of getNode()
return channel.getView().getMembers().stream()
.map(address -> address.toString().equals(masterAddress) ? new Node(MASTER, address, 0L) : new Node(SLAVE, address, 0L))
.collect(toImmutableList());
}
@Override
public DifferenceInfo getDifference(@NotNull MerkleTreeInfo treeInfo) {
// get actual merkle tree
MerkleTreeInfo localTreeInfo = _getMerkleTreeInfo(treeInfo.getDisk(), treeInfo.getPartition());
List<LeafNode> difference = MerkleTree.difference(treeInfo.getTree(), localTreeInfo.getTree());
DifferenceInfo differenceInfo = new DifferenceInfo();
differenceInfo.setDisk(treeInfo.getDisk());
differenceInfo.setPartition(treeInfo.getPartition());
differenceInfo.setDifference(difference);
return differenceInfo;
}
@Override
public boolean isMaster() {
return getSelfNode().equals(master);
}
@Override
public boolean tryRemoveMaster() {
// TODO: change executor
List<CompletableFuture<Void>> removers = getNodes().stream()
.map(node -> runAsync(() -> removeMaster(node.getAddress())))
.collect(toImmutableList());
allOf(toArray(removers, CompletableFuture.class))
.join();
return true;
}
@Override
public void deletePartitionFile(int disk, int partition) {
List<CompletableFuture<Void>> partitionDeleteWorkers = getNodes().stream()
.map(node -> runAsync(() -> deletePartitionFile(node.getAddress(), disk, partition)))
.collect(toImmutableList());
allOf(toArray(partitionDeleteWorkers, CompletableFuture.class))
.join();
}
@Override
public Optional<Node> getNodeForRepair(boolean active) {
if (active) {
return getMaster()
.filter(node -> !node.equals(getSelfNode()));
} else {
List<Node> nodes = getNodes();
return nodes.stream()
.filter(node -> !node.equals(getSelfNode()))
.skip(nodes.size() > 1 ? random.nextInt(nodes.size() - 1) : 0)
.findFirst();
}
}
@Override
public void setMaster(@NotNull Address node, @NotNull Address newMaster) {
if (getSelfNode().getAddress().equals(node)) {
_setMaster(newMaster);
return;
}
try {
dispatcher.callRemoteMethod(
node,
new MethodCall(SET_MASTER, newMaster),
new RequestOptions(GET_FIRST, 1000L)
);
} catch (Exception e) {
log.error("Can't call method " + SET_MASTER + " on remote node " + node, e);
}
}
@Override
public Node getMaster(@NotNull Address node) {
log.debug("Getting master");
if (getSelfNode().getAddress().equals(node)) {
return _getMaster();
}
try {
return dispatcher.callRemoteMethod(
node,
new MethodCall(GET_MASTER),
new RequestOptions(ResponseMode.GET_FIRST, 1000L));
} catch (Exception e) {
log.error("Can't call method " + GET_MASTER + " on remote node " + node, e);
}
return null;
}
@Override
public Node getNode(@NotNull Address node) {
log.debug("Getting node object");
if (getSelfNode().getAddress().equals(node)) {
return _getNode();
}
try {
return dispatcher.callRemoteMethod(
node,
new MethodCall(GET_NODE),
new RequestOptions(ResponseMode.GET_FIRST, 1000L));
} catch (Exception e) {
log.error("Can't call method " + GET_NODE + " on remote node " + node, e);
}
return null;
}
@NotNull
@Override
public MerkleTreeInfo getMerkleTreeInfo(@NotNull Address node, int disk, int partition) {
log.debug("Getting merkle tree info {} for disk {}", partition, disk);
if (getSelfNode().getAddress().equals(node)) {
return _getMerkleTreeInfo(disk, partition);
}
try {
return dispatcher.callRemoteMethod(
node,
new MethodCall(GET_TREE_INFO, disk, partition),
new RequestOptions(ResponseMode.GET_FIRST, 5 * 60 * 1000L)); // 5 minutes
} catch (Exception e) {
log.error("Can't call method " + GET_TREE_INFO + " on remote node " + node, e);
}
MerkleTreeInfo merkleTreeInfo = new MerkleTreeInfo();
merkleTreeInfo.setDisk(disk);
merkleTreeInfo.setPartition(partition);
return merkleTreeInfo;
}
@Nullable
@Override
public DifferenceInfo getDifference(@NotNull Address node, int disk, int partition) {
log.debug("Getting merkle tree diff {} for disk {}", partition, disk);
if (getSelfNode().getAddress().equals(node)) {
return _getDifference(disk, partition);
}
try {
return dispatcher.callRemoteMethod(
node,
new MethodCall(GET_TREE_DIFF_NODE, disk, partition),
new RequestOptions(ResponseMode.GET_FIRST, 5 * 60 * 1000L)); // 5 minutes
} catch (Exception e) {
log.error("Can't call method " + GET_TREE_DIFF_NODE + " on remote node " + node, e);
}
return null;
}
@Override
public void removeMaster(@NotNull Address node) {
if (getSelfNode().getAddress().equals(node)) {
_removeMaster();
return;
}
try {
dispatcher.callRemoteMethod(
node,
new MethodCall(REMOVE_MASTER),
new RequestOptions(ResponseMode.GET_NONE, 1000L)
);
} catch (Exception e) {
log.error("Can't call method " + REMOVE_MASTER + " on remote node " + node, e);
}
}
@Override
public void deletePartitionFile(@NotNull Address node, int disk, int partition) {
if (getSelfNode().getAddress().equals(node)) {
_deletePartitionFile(disk, partition);
return;
}
try {
dispatcher.callRemoteMethod(
node,
new MethodCall(DELETE_PARTITION_FILE, disk, partition),
new RequestOptions(GET_FIRST, 10000L)
);
} catch (Exception e) {
log.error("Can't call method " + DELETE_PARTITION_FILE + " on remote node " + node, e);
}
}
@Override
public void getState(OutputStream output) throws Exception {
Util.objectToStream(master, new DataOutputStream(output));
}
@Override
public void setState(InputStream input) throws Exception {
master = (Node) Util.objectFromStream(new DataInputStream(input));
}
@Override
public void receive(Message message) {
if (log.isTraceEnabled()) {
log.trace("Message received:" + message);
}
CustomMessageHeader customMessageHeader;
try {
customMessageHeader = (CustomMessageHeader) message.getHeader(CUSTOM_MESSAGE_HEADER);
} catch (ClassCastException e) {
throw new IllegalArgumentException("Can't find replication header!", e);
}
if (null == customMessageHeader) {
throw new IllegalArgumentException("Can't find replication header!");
}
switch (customMessageHeader.getCommand()) {
case FILE:
handleReplicatedFile(message);
break;
case REPLICATION_REQUEST:
masterSelectorAndRepairExecutor.submit(new ReplicationRequestHandler(message));
break;
case CACHE_INVALIDATE_REQUEST:
handleCacheInvalidate(message);
break;
default:
throw new IllegalArgumentException(String.format("Do not know what to do with %s", customMessageHeader.getCommand()));
}
}
@Override
public void viewAccepted(final View view) {
super.viewAccepted(view);
log.info("Nodes list is changed {} for node {}, creator {}", view, getSelfNode(), view.getCreator());
masterSelectorAndRepairExecutor.submit(new RepairTask(view));
}
/**
* RPC methods impl.
*/
public void _setMaster(Address newMaster) {
log.info("Set new master {}", newMaster);
Node oldMaster = master;
master = new Node(MASTER, newMaster, currentTimeMillis());
if (getSelfNode().getAddress().equals(newMaster)) {
setSelfNode(master);
log.info("Self updated");
} else {
setSelfNode(new Node(SLAVE, getSelfNode().getAddress(), currentTimeMillis()));
}
listener.onMasterChanged(self, oldMaster, master);
}
public Node _getMaster() {
return master;
}
public Node _getNode() {
return getSelfNode();
}
public void _removeMaster() {
log.info("Remove master");
Node oldMaster = master;
master = null;
setSelfNode(new Node(SLAVE, getSelfNode().getAddress(), currentTimeMillis()));
listener.onMasterChanged(self, oldMaster, master);
}
@NotNull
public MerkleTreeInfo _getMerkleTreeInfo(int disk, int partition) {
MerkleTreeInfo merkleTreeInfo = new MerkleTreeInfo();
merkleTreeInfo.setDisk(disk);
merkleTreeInfo.setPartition(partition);
Partition partitionObject = new Partition(disk, partition);
File file = null;
try {
file = fileListService.getFile(disk, partition);
if (null == file) {
merkleTreeInfo.setTree(createEmptyTree(indexService.getMinMaxRange(partitionObject), MAX_LEVEL));
return merkleTreeInfo;
} else {
merkleTreeInfo.setTree(buildMerkleTree(indexService, file, partitionObject));
return merkleTreeInfo;
}
} finally {
if (null != file) {
try {
file.close();
} catch (Exception ignored) {
}
}
}
}
public DifferenceInfo _getDifference(int disk, int partition) {
MerkleTree expectedTree = indexUtils.buildMerkleTree(new Partition(disk, partition));
MerkleTreeInfo expected = new MerkleTreeInfo();
expected.setDisk(disk);
expected.setPartition(partition);
expected.setTree(expectedTree);
return getDifference(expected);
}
public void _deletePartitionFile(int disk, int partition) {
log.info("Delete partition file: {} {}", disk, partition);
diskService.deleteFile(new Partition(disk, partition));
}
private void setSelfNode(Node node) {
self = node;
setCurrentContext(new MdcContext(ImmutableMap.of(SRC_NODE, getSelfNode().toString())));
}
private void handleNodeChanging() {
lockService.setChannel(channel);
boolean acquired = false;
Lock lock = lockService.getLock(MASTER_LOCK);
try {
acquired = lock.tryLock(5000L, TimeUnit.MILLISECONDS);
if (!acquired) {
return;
}
if (configuration.isMaster()) {
log.info("Master comes from configuration");
getMaster().ifPresent(
master -> {
if (!getSelfNode().equals(master)) {
throw new IllegalStateException("Master already exists!");
}
}
);
setMaster(getSelfNode());
}
log.trace("Locked {}");
View view = channel.getView();
Optional<Node> currentMaster = getMaster();
log.info("Current master is {}", currentMaster);
boolean masterIsAvailable = isCurrentMasterAvailable(view, currentMaster);
// just added slave
if (masterIsAvailable) {
// event could be handle on any node
// TODO: optimize, send master only to the added node
trySetMaster(currentMaster.get().getAddress());
} else {
ofNullable(master).ifPresent(
master -> {
// remove master on current node
log.warn("Master {} is not available", master);
this.removeMaster(getSelfNode().getAddress());
}
);
}
} catch (Exception e) {
log.error("Can't select master", e);
} finally {
if (acquired) {
lock.unlock();
log.trace("Unlocked");
}
}
}
private boolean isCurrentMasterAvailable(View view, Optional<Node> currentMaster) {
return currentMaster
.map(master -> view.getMembers().contains(master.getAddress()))
.orElse(false);
}
private void handleReplicatedFile(Message message) {
try {
Object file = message.getObject();
if (file instanceof ReplicationFile) {
replicationHandlerService.handleReplicated((ReplicationFile) message.getObject());
} else {
log.error("Do not know what to do with {}", file.getClass());
}
} catch (Exception e) {
log.error("Can't replicate block", e);
}
}
private void handleCacheInvalidate(Message message) {
try {
Object cacheKey = message.getObject();
if (cacheKey instanceof CacheKey) {
indexCacheService.remove((CacheKey) cacheKey);
}
} catch (Exception e) {
log.error("Can't invalidate cache", e);
}
}
private class RepairTask implements Runnable {
private final View view;
private RepairTask(View view) {
this.view = view;
}
@Override
public void run() {
try {
setCurrentContext(new MdcContext(ImmutableMap.of(SRC_NODE, getSelfNode().toString())));
handleNodeChanging();
log.info("Current view is {}", getChannel().getView());
if (!getMaster().isPresent()) {
log.info("There is no master, skip repairing");
return;
}
if (isRepairRequired()) {
log.info("Repairing started");
repairService.repairActive();
}
} catch (Throwable e) {
log.error("Can't select master", e);
} finally {
MdcUtils.clearCurrentContext();
}
}
private boolean isRepairRequired() {
return view.getMembers().size() > 1;
}
}
private class ReplicationRequestHandler implements Runnable {
private final Message message;
public ReplicationRequestHandler(Message message) {
this.message = message;
}
@Override
public void run() {
try {
setCurrentContext(new MdcContext(ImmutableMap.of(SRC_NODE, getSelfNode().toString())));
replicationClient.replicate((DifferenceInfo) message.getObject(), message.getSrc());
} catch (Exception e) {
log.error("Can't replicate file {}", message.getObject(), e);
} finally {
MdcUtils.clearCurrentContext();
}
}
}
}