/*
* Copyright (c) 2017 Strapdata (http://www.strapdata.com)
* Contains some code from Elasticsearch (http://www.elastic.co)
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.elassandra.cluster.routing;
import java.net.InetAddress;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.UUID;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.db.Keyspace;
import org.apache.cassandra.dht.Murmur3Partitioner.LongToken;
import org.apache.cassandra.dht.Range;
import org.apache.cassandra.dht.Token;
import org.apache.cassandra.exceptions.ConfigurationException;
import org.apache.cassandra.locator.AbstractReplicationStrategy;
import org.apache.cassandra.locator.TokenMetadata;
import org.apache.cassandra.service.StorageService;
import org.apache.cassandra.utils.FBUtilities;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.node.DiscoveryNode.DiscoveryNodeStatus;
import org.elasticsearch.cluster.routing.IndexRoutingTable;
import org.elasticsearch.cluster.routing.IndexShardRoutingTable;
import org.elasticsearch.cluster.routing.ShardRouting;
import org.elasticsearch.cluster.routing.ShardRoutingState;
import org.elasticsearch.cluster.routing.UnassignedInfo;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.common.transport.TransportAddress;
import org.elasticsearch.index.shard.ShardId;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Multimap;
/**
* Only support Murmur3 Long Token.
* SearchStrategy is per index
* SearchStrategy.Router is updated each time node join/leave/start/stop the cluster.
* SearchStrategy.Router.Route is per query route.
* @author vroyer
*
*/
public abstract class AbstractSearchStrategy {
public static ESLogger logger = Loggers.getLogger(AbstractSearchStrategy.class);
public static final Collection<Range<Token>> EMPTY_RANGE_TOKEN_LIST = ImmutableList.<Range<Token>>of();
public static final Token TOKEN_MIN = new LongToken(Long.MIN_VALUE);
public static final Token TOKEN_MAX = new LongToken(Long.MAX_VALUE);
public static final Range<Token> FULL_RANGE_TOKEN = new Range<Token>(new LongToken(Long.MIN_VALUE), new LongToken(Long.MAX_VALUE));
public abstract Router newRouter(final String index, final String ksName, final Map<UUID, ShardRoutingState> shardStates, final ClusterState clusterState);
// per index router, updated on each cassandra ring change.
public abstract class Router {
final String index;
final String ksName;
final long version;
final DiscoveryNode localNode;
final Map<UUID, ShardRoutingState> shardStates;
protected Multimap<Token,DiscoveryNode> tokenToNodes = ArrayListMultimap.create();
protected Map<DiscoveryNode, BitSet> greenShards; // available node to bitset of ranges => started primary.
protected Map<DiscoveryNode, BitSet> redShards; // unavailable node to bitset of orphan ranges => unassigned primary
protected List<DiscoveryNode> yellowShards; // unassigned replica
protected List<Token> tokens;
protected boolean isConsistent = true;
protected final TokenMetadata metadata;
protected final AbstractReplicationStrategy strategy;
public Router(final String index, final String ksName, final Map<UUID, ShardRoutingState> shardStates, final ClusterState clusterState, boolean includeReplica)
{
this.index = index;
this.ksName = ksName;
this.version = clusterState.version();
this.localNode = clusterState.nodes().localNode();
this.shardStates = shardStates;
if (Keyspace.isInitialized() && StorageService.instance.isJoined()) {
// only available when keyspaces are initialized and node joined
//this.rangeToEndpointsMap = StorageService.instance.getRangeToAddressMapInLocalDC(ksName);
this.strategy = Keyspace.open(ksName).getReplicationStrategy();
this.metadata = StorageService.instance.getTokenMetadata().cloneOnlyTokenMap();
for(DiscoveryNode node : clusterState.nodes()) {
for(Token token : this.metadata.getTokens(node.getInetAddress()))
this.tokenToNodes.put(token, node);
}
} else {
this.strategy = null;
this.metadata = null;
}
this.tokens = new ArrayList<Token>(this.tokenToNodes.keys());
this.tokens.add(TOKEN_MAX);
Collections.sort(tokens);
if (logger.isTraceEnabled())
logger.trace("index=[{}] keyspace=[{}] ordered tokens={}",index, ksName, this.tokens);
int i=0;
this.greenShards = new HashMap<DiscoveryNode, BitSet>();
for(Token token: tokens) {
if (TOKEN_MIN.equals(token))
continue;
// greenshard = available node -> token range bitset,
boolean orphanRange = true;
for(InetAddress endpoint : (this.metadata == null) ? Collections.singletonList(localNode.getInetAddress()) : this.strategy.calculateNaturalEndpoints(token, this.metadata)) {
UUID uuid = StorageService.instance.getHostId(endpoint);
DiscoveryNode node = (uuid == null) ? clusterState.nodes().findByInetAddress(endpoint) : clusterState.nodes().get(uuid.toString());
if (node != null && node.status() == DiscoveryNode.DiscoveryNodeStatus.ALIVE) {
if (ShardRoutingState.STARTED.equals(shardStates.get(node.uuid()))) {
orphanRange = false;
BitSet bs = greenShards.get(node);
if (bs == null) {
bs = new BitSet(tokens.size() - 1);
greenShards.put(node, bs);
}
bs.set(i);
if (!includeReplica)
break;
}
}
}
// redshards = unavailable node->token range bitset,
if (orphanRange) {
isConsistent = false;
if (redShards == null)
redShards = new HashMap<DiscoveryNode, BitSet>();
for(DiscoveryNode node : tokenToNodes.get(token)) {
BitSet bs = redShards.get(node);
if (bs == null) {
bs = new BitSet(tokens.size() - 1);
redShards.put(node, bs);
}
bs.set(i);
}
}
i++;
}
// yellow shards = unavailable nodes hosting token range available somewhere else in greenShards.
for(DiscoveryNode node : clusterState.nodes()) {
if (!this.greenShards.containsKey(node) && (this.redShards == null || !this.redShards.containsKey(node))) {
if (this.yellowShards == null) {
this.yellowShards = new ArrayList<DiscoveryNode>();
}
this.yellowShards.add(node);
}
}
if (logger.isTraceEnabled())
logger.trace("index=[{}] keyspace=[{}] isConsistent={} greenShards={} redShards={} yellowShards={}",index, ksName, this.isConsistent, this.greenShards, this.redShards, this.yellowShards);
}
public abstract Route newRoute(@Nullable String preference, TransportAddress src);
public boolean isConsistent() {
return this.isConsistent;
}
public ShardRoutingState getShardRoutingState(DiscoveryNode node) {
ShardRoutingState srs = this.shardStates.get(node.uuid());
return (srs==null) ? ShardRoutingState.UNASSIGNED : srs;
}
public Collection<Range<Token>> getTokenRanges(BitSet bs) {
List<Range<Token>> l = new ArrayList<Range<Token>>();
int i = 0;
while (i >= 0 && i < bs.length()) {
int left = bs.nextSetBit(i);
int right = bs.nextClearBit(left);
l.add(new Range<Token>( (left == 0) ? TOKEN_MIN : tokens.get(left -1), tokens.get(right - 1)));
i = right;
}
logger.trace("tokens={} bitset={} ranges={}", tokens, bs, l);
return l;
}
public abstract class Route {
List<IndexShardRoutingTable> shardRouting = null;
public Route() {
shardRouting = buildShardRouting();
}
/**
* Should returns selected shards token range bitset covering 100% of the cassandra ring.
* @return
*/
public abstract Map<DiscoveryNode, BitSet> selectedShards();
public List<IndexShardRoutingTable> getShardRouting() {
return this.shardRouting;
}
List<IndexShardRoutingTable> buildShardRouting() {
List<IndexShardRoutingTable> isrt = new ArrayList<IndexShardRoutingTable>(selectedShards().size() + ((Router.this.redShards!=null) ? Router.this.redShards.size() : 0) );
int i = 1;
boolean todo = true;
for(DiscoveryNode node : selectedShards().keySet()) {
int shardId = (localNode.id().equals(node.id())) ? 0 : i;
// started primary shards (green)
ShardRouting primaryShardRouting = new ShardRouting(index, shardId, node.id(), true,
ShardRoutingState.STARTED, version, null,
Router.this.getTokenRanges(selectedShards().get(node)));
if (todo && Router.this.yellowShards != null) {
// add all unassigned remote replica shards (yellow) on the first IndexShardRoutingTable.
todo = false;
List<ShardRouting> shards = new ArrayList<ShardRouting>(1+Router.this.yellowShards.size());
shards.add(primaryShardRouting);
for(DiscoveryNode node2 : Router.this.yellowShards) {
UnassignedInfo info = null;
if (ShardRoutingState.UNASSIGNED.equals(Router.this.getShardRoutingState(node2)))
info = IndexRoutingTable.UNASSIGNED_INFO_UNAVAILABLE;
if (node2.status() != DiscoveryNodeStatus.ALIVE)
info = IndexRoutingTable.UNASSIGNED_INFO_NODE_LEFT;
// unassigned secondary shards (yellow)
ShardRouting replicaShardRouting = new ShardRouting(index, shardId, node2.id(), false,
Router.this.getShardRoutingState(node2), version, info, EMPTY_RANGE_TOKEN_LIST);
shards.add(replicaShardRouting);
}
isrt.add( new IndexShardRoutingTable(new ShardId(index,shardId), shards) );
} else {
isrt.add( new IndexShardRoutingTable(new ShardId(index,shardId), primaryShardRouting) );
}
if (shardId != 0)
i++;
}
if (Router.this.redShards != null) {
for(DiscoveryNode node : Router.this.redShards.keySet()) {
int shardId = (localNode.id().equals(node.id())) ? 0 : i;
UnassignedInfo info = null;
if (ShardRoutingState.UNASSIGNED.equals(Router.this.getShardRoutingState(node)))
info = IndexRoutingTable.UNASSIGNED_INFO_UNAVAILABLE;
if (node.status() != DiscoveryNodeStatus.ALIVE)
info = IndexRoutingTable.UNASSIGNED_INFO_NODE_LEFT;
// add one unassigned primary shards (red) for orphan token ranges.
ShardRouting primaryShardRouting = new ShardRouting(index, shardId, node.id(), true,
Router.this.getShardRoutingState(node), version,
info,
Router.this.getTokenRanges(Router.this.redShards.get(node)));
isrt.add( new IndexShardRoutingTable(new ShardId(index,shardId), primaryShardRouting) );
if (shardId != 0)
i++;
}
}
// shuffle shards to distribute fetch requests on first shard.
Collections.shuffle(isrt);
return isrt;
}
}
};
public static Class<AbstractSearchStrategy> getSearchStrategyClass(String cls) throws ConfigurationException
{
String className = cls.contains(".") ? cls : "org.elassandra.cluster.routing." + cls;
Class<AbstractSearchStrategy> searchClass = FBUtilities.classForName(className, "search strategy");
if (!AbstractSearchStrategy.class.isAssignableFrom(searchClass))
{
throw new ConfigurationException(String.format((Locale)null, "Specified search strategy class (%s) is not derived from AbstractSearchStrategy", className));
}
return searchClass;
}
}