/**
* This file is part of Graylog.
*
* Graylog is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Graylog is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Graylog. If not, see <http://www.gnu.org/licenses/>.
*/
package org.graylog2.indexer.cluster;
import com.github.joschi.jadconfig.util.Duration;
import com.google.common.collect.ImmutableSet;
import com.google.common.primitives.Ints;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import io.searchbox.client.JestClient;
import io.searchbox.client.JestResult;
import io.searchbox.cluster.Health;
import io.searchbox.cluster.NodesInfo;
import io.searchbox.core.Cat;
import io.searchbox.core.CatResult;
import org.graylog2.indexer.IndexSetRegistry;
import org.graylog2.indexer.cluster.jest.JestUtils;
import org.graylog2.indexer.gson.GsonUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.inject.Inject;
import javax.inject.Named;
import javax.inject.Singleton;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import static org.graylog2.indexer.gson.GsonUtils.asInteger;
@Singleton
public class Cluster {
    private static final Logger LOG = LoggerFactory.getLogger(Cluster.class);

    private final JestClient jestClient;
    private final IndexSetRegistry indexSetRegistry;
    private final ScheduledExecutorService scheduler;
    private final Duration requestTimeout;

    /**
     * Provides cluster-level information and health checks for the connected Elasticsearch cluster.
     *
     * @param jestClient       the HTTP client used for all Elasticsearch requests
     * @param indexSetRegistry registry of the index sets managed by Graylog (wildcards, write aliases)
     * @param scheduler        shared daemon scheduler used for health-check polling
     * @param requestTimeout   timeout applied to Elasticsearch requests and used as the default wait timeout
     */
    @Inject
    public Cluster(JestClient jestClient,
                   IndexSetRegistry indexSetRegistry,
                   @Named("daemonScheduler") ScheduledExecutorService scheduler,
                   @Named("elasticsearch_request_timeout") Duration requestTimeout) {
        this.scheduler = scheduler;
        this.jestClient = jestClient;
        this.indexSetRegistry = indexSetRegistry;
        this.requestTimeout = requestTimeout;
    }

    /**
     * Requests the cluster health restricted to the given indices.
     *
     * @param indices index names, aliases, or wildcards to restrict the health check to
     * @return the raw cluster health response as a JSON object
     */
    private JsonObject clusterHealth(Collection<? extends String> indices) {
        final Health request = new Health.Builder()
                .addIndex(indices)
                .build();
        final JestResult jestResult = JestUtils.execute(jestClient, request, () -> "Couldn't read cluster health for indices " + indices);
        return jestResult.getJsonObject();
    }

    /**
     * Requests the cluster health for all indices managed by Graylog. (default: graylog_*)
     *
     * @return the cluster health response
     */
    public Optional<JsonObject> health() {
        return Optional.of(clusterHealth(Arrays.asList(indexSetRegistry.getIndexWildcards())));
    }

    /**
     * Requests the cluster health for the current write index. (deflector)
     *
     * This can be used to decide if the current write index is healthy and writable even when older indices have
     * problems.
     *
     * @return the cluster health response
     */
    public Optional<JsonObject> deflectorHealth() {
        return Optional.of(clusterHealth(Arrays.asList(indexSetRegistry.getWriteIndexAliases())));
    }

    /**
     * Retrieve the response for the <a href="https://www.elastic.co/guide/en/elasticsearch/reference/current/cat-nodes.html">cat nodes</a> request from Elasticsearch.
     *
     * @param fields The fields to show, see <a href="https://www.elastic.co/guide/en/elasticsearch/reference/current/cat-nodes.html">cat nodes API</a>.
     * @return A {@link JsonArray} with the result of the cat nodes request; never {@code null},
     *         an empty array is returned if the response contained no result list.
     */
    private JsonArray catNodes(String... fields) {
        final String fieldNames = String.join(",", fields);
        final Cat request = new Cat.NodesBuilder()
                .setParameter("h", fieldNames)
                .setParameter("full_id", true)
                .build();
        final CatResult response = JestUtils.execute(jestClient, request, () -> "Unable to read Elasticsearch node information");
        // Guard against a missing or non-array "result" element so callers can safely iterate.
        final JsonArray result = GsonUtils.asJsonArray(response.getJsonObject().get("result"));
        return result == null ? new JsonArray() : result;
    }

    /**
     * Returns the maximum number of file descriptors for each node in the cluster.
     *
     * @return one entry per node, with node name, host, and maximum file descriptor count
     */
    public Set<NodeFileDescriptorStats> getFileDescriptorStats() {
        final JsonArray nodes = catNodes("name", "host", "fileDescriptorMax");
        final ImmutableSet.Builder<NodeFileDescriptorStats> setBuilder = ImmutableSet.builder();
        for (JsonElement jsonElement : nodes) {
            if (jsonElement.isJsonObject()) {
                final JsonObject jsonObject = jsonElement.getAsJsonObject();
                final String name = GsonUtils.asString(jsonObject.get("name"));
                final String host = GsonUtils.asString(jsonObject.get("host"));
                final Long maxFileDescriptors = GsonUtils.asLong(jsonObject.get("fileDescriptorMax"));
                setBuilder.add(NodeFileDescriptorStats.create(name, host, maxFileDescriptors));
            }
        }
        return setBuilder.build();
    }

    /**
     * Resolves the name of the Elasticsearch node with the given ID.
     *
     * @param nodeId the Elasticsearch node ID
     * @return the node name, or empty if the node is unknown or the ID is blank
     */
    public Optional<String> nodeIdToName(String nodeId) {
        return getNodeInfo(nodeId).map(nodeInfo -> GsonUtils.asString(nodeInfo.get("name")));
    }

    /**
     * Resolves the host name of the Elasticsearch node with the given ID.
     *
     * @param nodeId the Elasticsearch node ID
     * @return the node's host name, or empty if the node is unknown or the ID is blank
     */
    public Optional<String> nodeIdToHostName(String nodeId) {
        return getNodeInfo(nodeId).map(nodeInfo -> GsonUtils.asString(nodeInfo.get("host")));
    }

    /**
     * Fetches the node info object for the given node ID via the nodes info API.
     *
     * @param nodeId the Elasticsearch node ID; may be {@code null} or empty
     * @return the node's info object, or empty if the ID is blank or the node is not in the response
     */
    private Optional<JsonObject> getNodeInfo(String nodeId) {
        if (nodeId == null || nodeId.isEmpty()) {
            return Optional.empty();
        }
        final NodesInfo request = new NodesInfo.Builder().addNode(nodeId).build();
        final JestResult result = JestUtils.execute(jestClient, request, () -> "Couldn't read information of Elasticsearch node " + nodeId);
        return Optional.ofNullable(result.getJsonObject())
                .map(json -> GsonUtils.asJsonObject(json.get("nodes")))
                .map(nodes -> GsonUtils.asJsonObject(nodes.get(nodeId)));
    }

    /**
     * Check if Elasticsearch is available and that there are data nodes in the cluster.
     *
     * @return {@code true} if the Elasticsearch client is up and the cluster contains data nodes, {@code false} otherwise
     */
    public boolean isConnected() {
        final Health request = new Health.Builder()
                .local()
                .timeout(Ints.saturatedCast(requestTimeout.toSeconds()))
                .build();
        final JestResult result = JestUtils.execute(jestClient, request, () -> "Couldn't check connection status of Elasticsearch");
        final int numberOfDataNodes = Optional.of(result.getJsonObject())
                .map(json -> asInteger(json.get("number_of_data_nodes")))
                .orElse(0);
        return numberOfDataNodes > 0;
    }

    /**
     * Checks that the given health response does not report status {@literal red} and that the
     * {@link IndexSetRegistry#isUp() deflector is up}.
     *
     * @param health a cluster health response (possibly absent)
     * @return {@code true} if the status is not red and the deflector is up, {@code false} otherwise
     */
    private boolean isHealthyAndDeflectorUp(Optional<JsonObject> health) {
        return health
                .map(json -> GsonUtils.asString(json.get("status")))
                .map(status -> !status.equals("red"))
                .map(healthy -> healthy && indexSetRegistry.isUp())
                .orElse(false);
    }

    /**
     * Check if the cluster health status is not {@literal RED} and that the
     * {@link IndexSetRegistry#isUp() deflector is up}.
     *
     * @return {@code true} if the cluster is healthy and the deflector is up, {@code false} otherwise
     */
    public boolean isHealthy() {
        return isHealthyAndDeflectorUp(health());
    }

    /**
     * Check if the deflector (write index) health status is not {@literal RED} and that the
     * {@link IndexSetRegistry#isUp() deflector is up}.
     *
     * @return {@code true} if the deflector is healthy and up, {@code false} otherwise
     */
    public boolean isDeflectorHealthy() {
        return isHealthyAndDeflectorUp(deflectorHealth());
    }

    /**
     * Blocks until the Elasticsearch cluster and current write index is healthy again or the given timeout fires.
     *
     * @param timeout the timeout value
     * @param unit the timeout unit
     * @throws InterruptedException if the waiting thread is interrupted
     * @throws TimeoutException if the write-active index did not become healthy within the timeout
     */
    public void waitForConnectedAndDeflectorHealthy(long timeout, TimeUnit unit) throws InterruptedException, TimeoutException {
        LOG.debug("Waiting until the write-active index is healthy again, checking once per second.");
        final CountDownLatch latch = new CountDownLatch(1);
        final ScheduledFuture<?> scheduledFuture = scheduler.scheduleAtFixedRate(() -> {
            try {
                if (isConnected() && isDeflectorHealthy()) {
                    LOG.debug("Write-active index is healthy again, unblocking waiting threads.");
                    latch.countDown();
                }
            } catch (Exception e) {
                // Deliberately keep polling on failure; don't let an exception cancel the schedule.
                LOG.debug("Ignoring exception while waiting for healthy cluster", e);
            }
        }, 0, 1, TimeUnit.SECONDS); // TODO should this be configurable?
        final boolean waitSuccess = latch.await(timeout, unit);
        scheduledFuture.cancel(true); // Make sure to cancel the task to avoid task leaks!
        if (!waitSuccess) {
            throw new TimeoutException("Write-active index didn't get healthy within timeout");
        }
    }

    /**
     * Blocks until the Elasticsearch cluster and current write index is healthy again or the default timeout fires.
     *
     * @throws InterruptedException if the waiting thread is interrupted
     * @throws TimeoutException if the write-active index did not become healthy within the default timeout
     */
    public void waitForConnectedAndDeflectorHealthy() throws InterruptedException, TimeoutException {
        waitForConnectedAndDeflectorHealthy(requestTimeout.getQuantity(), requestTimeout.getUnit());
    }
}