package org.apache.solr.cloud;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with this
* work for additional information regarding copyright ownership. The ASF
* licenses this file to You under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.noggit.JSONUtil;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.ClosableThread;
import org.apache.solr.common.cloud.HashPartitioner;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.Slice;
import org.apache.solr.common.cloud.SolrZkClient;
import org.apache.solr.common.cloud.ZkCoreNodeProps;
import org.apache.solr.common.cloud.ZkNodeProps;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.cloud.ZooKeeperException;
import org.apache.solr.handler.component.ShardHandler;
import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Cluster leader. Responsible for shard assignments and for maintaining the cluster state file.
*/
public class Overseer {
/** Name of the message property that tells the Overseer which operation to perform. */
public static final String QUEUE_OPERATION = "operation";
/** Pause, in milliseconds, between two passes over the state update queue. */
private static final int STATE_UPDATE_DELAY = 500;
/** Shared logger; never reassigned, hence final. */
private static final Logger log = LoggerFactory.getLogger(Overseer.class);
private class ClusterStateUpdater implements Runnable, ClosableThread {
// Operation name of the message that removes a core from the cluster state.
private static final String DELETECORE = "deletecore";
// Source of the cluster state and of the lock taken while the state is being updated.
private final ZkStateReader reader;
// ZooKeeper client obtained from the reader in the constructor.
private final SolrZkClient zkClient;
// Id of this Overseer; compared with the "id" stored in /overseer_elect/leader.
private final String myId;
// Queue where everybody can throw tasks.
private final DistributedQueue stateUpdateQueue;
// Internal queue where the Overseer stores events that have not yet been published into the cluster state.
// If the Overseer dies while draining the main queue, a new Overseer will start from this queue.
private final DistributedQueue workQueue;
// Set by close(); checked by run() before each pass.
private volatile boolean isClosed;
public ClusterStateUpdater(final ZkStateReader reader, final String myId) {
this.zkClient = reader.getZkClient();
this.stateUpdateQueue = getInQueue(zkClient);
this.workQueue = getInternalQueue(zkClient);
this.myId = myId;
this.reader = reader;
}
/**
* Main loop of the cluster state updater.
*
* Phase 1 (only if not closed and still leader): replays every message found in the
* work queue, writing the cluster state to ZooKeeper after each message and then
* removing that message from the work queue.
*
* Phase 2: while not closed and still leader, drains the state update queue. Each
* message is applied to the in-memory state, copied to the work queue and removed
* from the state update queue; the resulting state is written to ZooKeeper once per
* batch, after which the work queue is emptied. The loop then sleeps for
* STATE_UPDATE_DELAY milliseconds.
*
* In both phases a KeeperException with code SESSIONEXPIRED or CONNECTIONLOSS, or an
* InterruptedException, ends the method; any other KeeperException is logged and
* rethrown as a ZooKeeperException.
*/
@Override
public void run() {
if(!this.isClosed && amILeader()) {
// see if there's something left from the previous Overseer and re
// process all events that were not persisted into cloud state
synchronized (reader.getUpdateLock()) { //XXX this only protects against edits inside single node
try {
byte[] head = workQueue.peek();
if (head != null) {
// refresh the local view before replaying anything on top of it
reader.updateClusterState(true);
ClusterState clusterState = reader.getClusterState();
log.info("Replaying operations from work queue.");
// leadership is re-checked before every replayed message
while (head != null && amILeader()) {
final ZkNodeProps message = ZkNodeProps.load(head);
final String operation = message
.getStr(QUEUE_OPERATION);
clusterState = processMessage(clusterState, message, operation);
// publish first, then drop the message, so a crash in between leaves the message queued
zkClient.setData(ZkStateReader.CLUSTER_STATE,
ZkStateReader.toJSON(clusterState), true);
workQueue.remove();
head = workQueue.peek();
}
}
} catch (KeeperException e) {
if (e.code() == KeeperException.Code.SESSIONEXPIRED
|| e.code() == KeeperException.Code.CONNECTIONLOSS) {
log.warn("Solr cannot talk to ZK");
return;
}
SolrException.log(log, "", e);
throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR,
"", e);
} catch (InterruptedException e) {
// restore the interrupt flag for whoever owns this thread, then stop
Thread.currentThread().interrupt();
return;
}
}
}
// NOTE: this is logged even when the replay phase above was skipped
log.info("Starting to work on the main queue");
while (!this.isClosed && amILeader()) {
synchronized (reader.getUpdateLock()) {
try {
byte[] head = stateUpdateQueue.peek();
if (head != null) {
reader.updateClusterState(true);
ClusterState clusterState = reader.getClusterState();
// apply the whole backlog in memory before writing the state once
while (head != null) {
final ZkNodeProps message = ZkNodeProps.load(head);
final String operation = message.getStr(QUEUE_OPERATION);
clusterState = processMessage(clusterState, message, operation);
// keep a copy in the work queue until the state has been published
workQueue.offer(head);
stateUpdateQueue.remove();
head = stateUpdateQueue.peek();
}
zkClient.setData(ZkStateReader.CLUSTER_STATE,
ZkStateReader.toJSON(clusterState), true);
}
// clean work queue
// (empty loop body: poll() is called until it returns null)
while (workQueue.poll() != null);
} catch (KeeperException e) {
if (e.code() == KeeperException.Code.SESSIONEXPIRED
|| e.code() == KeeperException.Code.CONNECTIONLOSS) {
log.warn("Overseer cannot talk to ZK");
return;
}
SolrException.log(log, "", e);
throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR,
"", e);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
return;
}
}
try {
Thread.sleep(STATE_UPDATE_DELAY);
} catch (InterruptedException e) {
// only the flag is restored here; the loop condition decides whether to continue
Thread.currentThread().interrupt();
}
}
}
/**
 * Applies one queue message to the given cluster state.
 *
 * Supported operations are "state" (register or refresh a core), "deletecore"
 * (remove a core) and the leader operation (mark the leader of a shard).
 *
 * @param clusterState state to start from
 * @param message the queue message
 * @param operation value of the message's operation property
 * @return the cluster state after the message has been applied
 * @throws RuntimeException if the operation is not one of the supported ones
 */
private ClusterState processMessage(ClusterState clusterState,
    final ZkNodeProps message, final String operation) {
  if ("state".equals(operation)) {
    clusterState = updateState(clusterState, message);
  } else if (DELETECORE.equals(operation)) {
    clusterState = removeCore(clusterState, message);
  } else if (ZkStateReader.LEADER_PROP.equals(operation)) {
    final String leaderUrl = buildLeaderUrl(
        message.getStr(ZkStateReader.BASE_URL_PROP),
        message.getStr(ZkStateReader.CORE_NAME_PROP));
    clusterState = setShardLeader(clusterState,
        message.getStr(ZkStateReader.COLLECTION_PROP),
        message.getStr(ZkStateReader.SHARD_ID_PROP),
        leaderUrl);
  } else {
    throw new RuntimeException("unknown operation:" + operation
        + " contents:" + message.getProperties());
  }
  return clusterState;
}

/**
 * Joins base URL and core name into a URL that ends with a slash.
 *
 * Missing parts are skipped rather than rendered as the text "null"; when neither
 * part contributes any character the result is null, meaning "no leader URL".
 */
private String buildLeaderUrl(final String baseUrl, final String coreName) {
  final StringBuilder sb = new StringBuilder();
  if (baseUrl != null) {
    sb.append(baseUrl);
    if (!baseUrl.endsWith("/")) sb.append("/");
  }
  if (coreName != null) sb.append(coreName);
  if (sb.length() == 0) {
    return null;
  }
  if (sb.charAt(sb.length() - 1) != '/') sb.append("/");
  return sb.toString();
}
/**
 * Checks whether the id stored in /overseer_elect/leader is this Overseer's id.
 *
 * A KeeperException is logged as a warning and an InterruptedException restores the
 * interrupt flag; both cases are reported as "not leader".
 *
 * @return true only if the leader node could be read and names this Overseer
 */
private boolean amILeader() {
  boolean leader = false;
  try {
    final byte[] leaderData = zkClient.getData("/overseer_elect/leader", null, null, true);
    final ZkNodeProps leaderProps = ZkNodeProps.load(leaderData);
    leader = myId.equals(leaderProps.getStr("id"));
  } catch (KeeperException zkProblem) {
    log.warn("", zkProblem);
  } catch (InterruptedException interrupted) {
    Thread.currentThread().interrupt();
  }
  if (!leader) {
    log.info("According to ZK I (id=" + myId + ") am no longer a leader.");
  }
  return leader;
}
/**
 * Registers the core described by the message in the cluster state, or refreshes its
 * entry if it is already there.
 *
 * The shard is taken from the message if given, otherwise from an existing assignment,
 * otherwise a new one is requested from AssignShard. If the collection is unknown and
 * the message carries a shard count, placeholder shards are created first.
 *
 * @param state cluster state to start from
 * @param message the "state" message published by a core
 * @return a new cluster state containing the updated slice
 */
private ClusterState updateState(ClusterState state, final ZkNodeProps message) {
  final String collection = message.getStr(ZkStateReader.COLLECTION_PROP);
  final String nodeName = message.getStr(ZkStateReader.NODE_NAME_PROP);
  final String zkCoreNodeName = nodeName + "_" + message.getStr(ZkStateReader.CORE_NAME_PROP);
  final String numShardsValue = message.getStr(ZkStateReader.NUM_SHARDS_PROP);
  final Integer numShards = (numShardsValue == null) ? null : Integer.valueOf(numShardsValue);

  // collection does not yet exist, create placeholders if num shards is specified
  if (!state.getCollections().contains(collection) && numShards != null) {
    state = createCollection(state, collection, numShards);
  }

  // prefer the shard id from the message, then an existing assignment, then a fresh one
  String sliceName = message.getStr(ZkStateReader.SHARD_ID_PROP);
  if (sliceName == null) {
    sliceName = getAssignedId(state, nodeName, message);
    if (sliceName == null) {
      sliceName = AssignShard.assignShard(collection, state, numShards);
    }
  }

  final Slice existing = state.getSlice(collection, sliceName);
  final Map<String,Object> replicaProps = new LinkedHashMap<String,Object>(message.getProperties());

  Map<String,Object> sliceProps = null;
  final Map<String,Replica> replicas;
  if (existing == null) {
    replicas = new HashMap<String, Replica>(1);
  } else {
    // carry the leader flag over from the previous entry of this core
    final Replica previous = existing.getReplicasMap().get(zkCoreNodeName);
    if (previous != null && previous.containsKey(ZkStateReader.LEADER_PROP)) {
      replicaProps.put(ZkStateReader.LEADER_PROP, previous.get(ZkStateReader.LEADER_PROP));
    }
    sliceProps = existing.getProperties();
    replicas = existing.getReplicasCopy();
  }

  // we don't put num_shards (or the queue operation) in the clusterstate
  replicaProps.remove(ZkStateReader.NUM_SHARDS_PROP);
  replicaProps.remove(QUEUE_OPERATION);

  final Replica replica = new Replica(zkCoreNodeName, replicaProps);
  replicas.put(replica.getName(), replica);

  // TODO: where do we get slice properties in this message? or should there be a separate create-slice message if we want that?
  return updateSlice(state, collection, new Slice(sliceName, replicas, sliceProps));
}
/**
 * Adds a collection made of empty placeholder shards named shard1..shardN, each
 * carrying one of the hash ranges produced by HashPartitioner for the full range.
 *
 * @param state cluster state to start from
 * @param collectionName name of the collection to add
 * @param numShards number of placeholder shards to create
 * @return a new cluster state with the same live nodes plus the new collection
 */
private ClusterState createCollection(ClusterState state, String collectionName, int numShards) {
  final HashPartitioner partitioner = new HashPartitioner();
  final List<HashPartitioner.Range> ranges = partitioner.partitionRange(numShards, partitioner.fullRange());

  final Map<String, Slice> placeholders = new LinkedHashMap<String,Slice>();
  for (int shardNo = 1; shardNo <= numShards; shardNo++) {
    final String sliceName = "shard" + shardNo;
    final Map<String,Object> sliceProps = new LinkedHashMap<String,Object>(1);
    sliceProps.put(Slice.RANGE, ranges.get(shardNo - 1));
    placeholders.put(sliceName, new Slice(sliceName, null, sliceProps));
  }

  final Map<String, Map<String, Slice>> collections =
      new LinkedHashMap<String,Map<String, Slice>>(state.getCollectionStates());
  collections.put(collectionName, placeholders);
  return new ClusterState(state.getLiveNodes(), collections);
}
/*
 * Looks up the slice that already holds the core described by coreState.
 * Returns the slice name, or null if the core is not assigned to any slice
 * of its collection. The nodeName argument is not consulted.
 */
private String getAssignedId(final ClusterState state, final String nodeName,
    final ZkNodeProps coreState) {
  final String coreNodeName = coreState.getStr(ZkStateReader.NODE_NAME_PROP)
      + "_" + coreState.getStr(ZkStateReader.CORE_NAME_PROP);
  final Map<String, Slice> candidates = state.getSlices(coreState.getStr(ZkStateReader.COLLECTION_PROP));
  if (candidates == null) {
    return null;
  }
  for (Slice candidate : candidates.values()) {
    final Replica assigned = candidate.getReplicasMap().get(coreNodeName);
    if (assigned != null) {
      return candidate.getName();
    }
  }
  return null;
}
/**
 * Returns a cluster state in which the given slice has been added to, or replaced in,
 * the given collection. The maps of the incoming state are copied, not modified.
 *
 * @param state cluster state to start from
 * @param collection collection that owns the slice
 * @param slice the slice to store under its own name
 * @return a new cluster state with the same live nodes
 */
private ClusterState updateSlice(ClusterState state, String collection, Slice slice) {
  // shallow copy of the collection map
  final Map<String, Map<String, Slice>> collections =
      new LinkedHashMap<String,Map<String,Slice>>(state.getCollectionStates());

  final Map<String, Slice> current = collections.get(collection);
  final Map<String, Slice> updated;
  if (current != null) {
    // shallow copy of the slices of this collection
    updated = new LinkedHashMap<String, Slice>(current);
  } else {
    updated = new HashMap<String, Slice>(1);
  }
  updated.put(slice.getName(), slice);
  collections.put(collection, updated);

  return new ClusterState(state.getLiveNodes(), collections);
}
/**
 * Marks the replica whose core URL equals leaderUrl as leader of the given slice and
 * removes the leader flag from the previous leader if it is a different replica.
 *
 * If the collection or the slice does not exist, an error is logged and the incoming
 * state is returned unchanged.
 *
 * @param state cluster state to start from
 * @param collection collection that owns the slice
 * @param sliceName name of the slice whose leader changes
 * @param leaderUrl core URL of the new leader; a value matching no replica only clears
 *                  the flag of the previous leader
 * @return a new cluster state, or the incoming one if nothing could be updated
 */
private ClusterState setShardLeader(ClusterState state, String collection, String sliceName, String leaderUrl) {
  final Map<String, Map<String, Slice>> newStates = new LinkedHashMap<String,Map<String,Slice>>(state.getCollectionStates());
  Map<String, Slice> slices = newStates.get(collection);
  if (slices == null) {
    log.error("Could not mark shard leader for non existing collection:" + collection);
    return state;
  }

  // make a shallow copy and add it to the new collection
  slices = new LinkedHashMap<String,Slice>(slices);
  newStates.put(collection, slices);

  final Slice slice = slices.get(sliceName);
  if (slice == null) {
    log.error("Could not mark leader for non existing slice:" + sliceName);
    return state;
  }

  // TODO: consider just putting the leader property on the shard, not on individual replicas
  final Replica oldLeader = slice.getLeader();
  final Map<String,Replica> newReplicas = new LinkedHashMap<String,Replica>();
  for (Replica replica : slice.getReplicas()) {
    // TODO: this should only be calculated once and cached somewhere?
    final String coreURL = ZkCoreNodeProps.getCoreUrl(replica.getStr(ZkStateReader.BASE_URL_PROP), replica.getStr(ZkStateReader.CORE_NAME_PROP));
    if (coreURL.equals(leaderUrl)) {
      final Map<String,Object> replicaProps = new LinkedHashMap<String,Object>(replica.getProperties());
      replicaProps.put(Slice.LEADER, "true"); // TODO: allow booleans instead of strings
      replica = new Replica(replica.getName(), replicaProps);
    } else if (replica == oldLeader) {
      // the previous leader is a different replica: drop its flag
      final Map<String,Object> replicaProps = new LinkedHashMap<String,Object>(replica.getProperties());
      replicaProps.remove(Slice.LEADER);
      replica = new Replica(replica.getName(), replicaProps);
    }
    newReplicas.put(replica.getName(), replica);
  }

  // the slice keeps its own properties; only the replica map is replaced
  final Slice newSlice = new Slice(slice.getName(), newReplicas, slice.getProperties());
  slices.put(newSlice.getName(), newSlice);
  return new ClusterState(state.getLiveNodes(), newStates);
}
/*
 * Remove core from cloudstate.
 *
 * The core is identified by node name + "_" + core name and is removed from every
 * slice of the message's collection that contains it. If no replica is left in that
 * collection afterwards, the collection is dropped from the state and its
 * /collections/<name> tree is cleaned in ZooKeeper (failures are logged, not thrown).
 * A message without a collection property matches no collection and leaves the
 * collections untouched instead of failing with a NullPointerException.
 */
private ClusterState removeCore(final ClusterState clusterState, ZkNodeProps message) {
  final String coreNodeName = message.getStr(ZkStateReader.NODE_NAME_PROP) + "_" + message.getStr(ZkStateReader.CORE_NAME_PROP);
  final String collection = message.getStr(ZkStateReader.COLLECTION_PROP);

  final LinkedHashMap<String, Map<String, Slice>> newStates = new LinkedHashMap<String,Map<String,Slice>>();
  for (String collectionName : clusterState.getCollections()) {
    if (collection == null || !collection.equals(collectionName)) {
      // not the collection named in the message: carry it over as is
      newStates.put(collectionName, clusterState.getSlices(collectionName));
      continue;
    }

    final Map<String, Slice> slices = clusterState.getSlices(collection);
    final LinkedHashMap<String, Slice> newSlices = new LinkedHashMap<String, Slice>();
    int remainingReplicas = 0;
    for (Slice slice : slices.values()) {
      Slice kept = slice;
      if (slice.getReplicasMap().containsKey(coreNodeName)) {
        final Map<String, Replica> newReplicas = slice.getReplicasCopy();
        newReplicas.remove(coreNodeName);
        kept = new Slice(slice.getName(), newReplicas, slice.getProperties());
      }
      newSlices.put(slice.getName(), kept);
      remainingReplicas += kept.getReplicasMap().size();
    }

    // TODO: if no nodes are left after this unload
    // remove from zk - do we have a race where Overseer
    // see's registered nodes and publishes though?
    if (remainingReplicas > 0) {
      newStates.put(collectionName, newSlices);
    } else {
      // TODO: it might be better logically to have this in ZkController
      // but for tests (it's easier) it seems better for the moment to leave CoreContainer and/or
      // ZkController out of the Overseer.
      try {
        zkClient.clean("/collections/" + collectionName);
      } catch (InterruptedException e) {
        SolrException.log(log, "Cleaning up collection in zk was interrupted:" + collectionName, e);
        Thread.currentThread().interrupt();
      } catch (KeeperException e) {
        SolrException.log(log, "Problem cleaning up collection in zk:" + collectionName, e);
      }
    }
  }

  return new ClusterState(clusterState.getLiveNodes(), newStates);
}
/** Raises the closed flag that run() checks before each pass. */
@Override
public void close() {
  isClosed = true;
}
/** @return true once close() has been called on this updater */
@Override
public boolean isClosed() {
  return isClosed;
}
}
/**
 * Thread that runs one of the Overseer's workers and can be closed.
 *
 * Closing the thread also closes the worker it runs when that worker is a
 * ClosableThread, so the worker's own closed flag is raised and its loop can stop
 * without relying on an interrupt alone.
 */
class OverseerThread extends Thread implements ClosableThread {
  // the worker handed to Thread; kept so close() can reach it
  private final Runnable task;
  private volatile boolean isClosed;

  public OverseerThread(ThreadGroup tg,
      ClusterStateUpdater clusterStateUpdater) {
    super(tg, clusterStateUpdater);
    this.task = clusterStateUpdater;
  }

  public OverseerThread(ThreadGroup ccTg,
      OverseerCollectionProcessor overseerCollectionProcessor, String string) {
    super(ccTg, overseerCollectionProcessor, string);
    this.task = overseerCollectionProcessor;
  }

  /** Marks this thread as closed and forwards the request to the worker if it supports it. */
  @Override
  public void close() {
    this.isClosed = true;
    if (task instanceof ClosableThread) {
      ((ClosableThread) task).close();
    }
  }

  /** @return true once close() has been called on this thread */
  @Override
  public boolean isClosed() {
    return this.isClosed;
  }
}
// Thread running the OverseerCollectionProcessor; assigned in start().
private OverseerThread ccThread;
// Thread running the ClusterStateUpdater; assigned in start().
private OverseerThread updaterThread;
// Set to true by close().
private volatile boolean isClosed;
// Handed to both workers; also the source of the ZooKeeper client used in start().
private ZkStateReader reader;
// Passed through to the OverseerCollectionProcessor.
private ShardHandler shardHandler;
// Passed through to the OverseerCollectionProcessor.
private String adminPath;
/**
 * Stores the collaborators; no thread is started until {@link #start(String)} is called.
 *
 * @param shardHandler handed to the collection processor
 * @param adminPath handed to the collection processor
 * @param reader provides the ZooKeeper client and the cluster state
 */
public Overseer(ShardHandler shardHandler, String adminPath, final ZkStateReader reader) throws KeeperException, InterruptedException {
  this.shardHandler = shardHandler;
  this.adminPath = adminPath;
  this.reader = reader;
}
/**
 * Makes sure the /overseer node exists and launches the two daemon threads of the
 * Overseer: the cluster state updater and the collection processor.
 *
 * @param id id of this Overseer, used by the workers to check leadership
 */
public void start(String id) {
  log.info("Overseer (id=" + id + ") starting");
  createOverseerNode(reader.getZkClient());

  // cluster state updater thread
  final ThreadGroup updaterGroup = new ThreadGroup("Overseer state updater.");
  final ClusterStateUpdater stateUpdater = new ClusterStateUpdater(reader, id);
  updaterThread = new OverseerThread(updaterGroup, stateUpdater);
  updaterThread.setDaemon(true);

  // collection processor thread
  final ThreadGroup processorGroup = new ThreadGroup("Overseer collection creation process.");
  final OverseerCollectionProcessor collectionProcessor =
      new OverseerCollectionProcessor(reader, id, shardHandler, adminPath);
  ccThread = new OverseerThread(processorGroup, collectionProcessor, "Overseer-" + id);
  ccThread.setDaemon(true);

  updaterThread.start();
  ccThread.start();
}
/**
 * Marks the Overseer as closed, then closes and interrupts each worker thread that
 * has been started: first the state updater, then the collection processor.
 */
public void close() {
  isClosed = true;
  final OverseerThread[] workers = { updaterThread, ccThread };
  for (OverseerThread worker : workers) {
    if (worker == null) {
      continue; // start() was never called
    }
    worker.close();
    worker.interrupt();
  }
}
/**
 * Returns the queue through which messages are sent to the Overseer
 * (/overseer/queue), creating the /overseer node first if needed.
 */
public static DistributedQueue getInQueue(final SolrZkClient zkClient) {
  createOverseerNode(zkClient);
  final DistributedQueue inQueue = new DistributedQueue(zkClient, "/overseer/queue", null);
  return inQueue;
}
/*
 * Returns the Overseer's internal work queue (/overseer/queue-work), creating the
 * /overseer node first if needed. Not to be used outside of Overseer.
 */
static DistributedQueue getInternalQueue(final SolrZkClient zkClient) {
  createOverseerNode(zkClient);
  final DistributedQueue workQueue = new DistributedQueue(zkClient, "/overseer/queue-work", null);
  return workQueue;
}
/*
 * Returns the collection creation queue (/overseer/collection-queue-work), creating
 * the /overseer node first if needed.
 */
static DistributedQueue getCollectionQueue(final SolrZkClient zkClient) {
  createOverseerNode(zkClient);
  final DistributedQueue collectionQueue = new DistributedQueue(zkClient, "/overseer/collection-queue-work", null);
  return collectionQueue;
}
/**
 * Creates the persistent /overseer node. An already existing node is fine; any other
 * ZooKeeper failure or an interrupt is logged and rethrown as a RuntimeException
 * (the interrupt flag is restored first).
 */
private static void createOverseerNode(final SolrZkClient zkClient) {
  final byte[] noData = new byte[0];
  try {
    zkClient.create("/overseer", noData, CreateMode.PERSISTENT, true);
  } catch (KeeperException.NodeExistsException alreadyThere) {
    // the node is already in place, nothing to do
  } catch (InterruptedException interrupted) {
    log.error("Could not create Overseer node", interrupted);
    Thread.currentThread().interrupt();
    throw new RuntimeException(interrupted);
  } catch (KeeperException zkFailure) {
    log.error("Could not create Overseer node", zkFailure);
    throw new RuntimeException(zkFailure);
  }
}
}