/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.indexwriter.elastic;
import static org.elasticsearch.node.NodeBuilder.nodeBuilder;
import java.lang.invoke.MethodHandles;
import java.io.BufferedReader;
import java.io.IOException;
import java.net.InetAddress;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobConf;
import org.apache.nutch.indexer.IndexWriter;
import org.apache.nutch.indexer.NutchDocument;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.bulk.BackoffPolicy;
import org.elasticsearch.action.bulk.BulkProcessor;
import org.elasticsearch.action.delete.DeleteRequest;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.client.Client;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.unit.ByteSizeUnit;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.InetSocketTransportAddress;
import org.elasticsearch.node.Node;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Sends NutchDocuments to a configured Elasticsearch index.
*/
public class ElasticIndexWriter implements IndexWriter {
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
private static final int DEFAULT_PORT = 9300;
private static final int DEFAULT_MAX_BULK_DOCS = 250;
private static final int DEFAULT_MAX_BULK_LENGTH = 2500500;
private static final int DEFAULT_EXP_BACKOFF_MILLIS = 100;
private static final int DEFAULT_EXP_BACKOFF_RETRIES = 10;
private static final int DEFAULT_BULK_CLOSE_TIMEOUT = 600;
private static final String DEFAULT_INDEX = "nutch";
private String defaultIndex;
private Client client;
private Node node;
private BulkProcessor bulkProcessor;
private long bulkCloseTimeout;
private Configuration config;
@Override
public void open(JobConf job, String name) throws IOException {
bulkCloseTimeout = job.getLong(ElasticConstants.BULK_CLOSE_TIMEOUT,
DEFAULT_BULK_CLOSE_TIMEOUT);
defaultIndex = job.get(ElasticConstants.INDEX, DEFAULT_INDEX);
int maxBulkDocs = job.getInt(ElasticConstants.MAX_BULK_DOCS,
DEFAULT_MAX_BULK_DOCS);
int maxBulkLength = job.getInt(ElasticConstants.MAX_BULK_LENGTH,
DEFAULT_MAX_BULK_LENGTH);
int expBackoffMillis = job.getInt(ElasticConstants.EXPONENTIAL_BACKOFF_MILLIS,
DEFAULT_EXP_BACKOFF_MILLIS);
int expBackoffRetries = job.getInt(ElasticConstants.EXPONENTIAL_BACKOFF_RETRIES,
DEFAULT_EXP_BACKOFF_RETRIES);
client = makeClient(job);
LOG.debug("Creating BulkProcessor with maxBulkDocs={}, maxBulkLength={}", maxBulkDocs, maxBulkLength);
bulkProcessor = BulkProcessor.builder(client, bulkProcessorListener())
.setBulkActions(maxBulkDocs)
.setBulkSize(new ByteSizeValue(maxBulkLength, ByteSizeUnit.BYTES))
.setConcurrentRequests(1)
.setBackoffPolicy(BackoffPolicy.exponentialBackoff(
TimeValue.timeValueMillis(expBackoffMillis), expBackoffRetries))
.build();
}
/** Generates a TransportClient or NodeClient */
protected Client makeClient(Configuration conf) throws IOException {
String clusterName = conf.get(ElasticConstants.CLUSTER);
String[] hosts = conf.getStrings(ElasticConstants.HOSTS);
int port = conf.getInt(ElasticConstants.PORT, DEFAULT_PORT);
Settings.Builder settingsBuilder = Settings.settingsBuilder();
BufferedReader reader = new BufferedReader(
conf.getConfResourceAsReader("elasticsearch.conf"));
String line;
String parts[];
while ((line = reader.readLine()) != null) {
if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
line.trim();
parts = line.split("=");
if (parts.length == 2) {
settingsBuilder.put(parts[0].trim(), parts[1].trim());
}
}
}
// Set the cluster name and build the settings
if (StringUtils.isNotBlank(clusterName))
settingsBuilder.put("cluster.name", clusterName);
Settings settings = settingsBuilder.build();
Client client = null;
// Prefer TransportClient
if (hosts != null && port > 1) {
TransportClient transportClient = TransportClient.builder().settings(settings).build();
for (String host: hosts)
transportClient.addTransportAddress(new InetSocketTransportAddress(InetAddress.getByName(host), port));
client = transportClient;
} else if (clusterName != null) {
node = nodeBuilder().settings(settings).client(true).node();
client = node.client();
}
return client;
}
/** Generates a default BulkProcessor.Listener */
protected BulkProcessor.Listener bulkProcessorListener() {
return new BulkProcessor.Listener() {
@Override
public void beforeBulk(long executionId, BulkRequest request) { }
@Override
public void afterBulk(long executionId, BulkRequest request, Throwable failure) {
throw new RuntimeException(failure);
}
@Override
public void afterBulk(long executionId, BulkRequest request, BulkResponse response) {
if (response.hasFailures()) {
LOG.warn("Failures occurred during bulk request");
}
}
};
}
@Override
public void write(NutchDocument doc) throws IOException {
String id = (String) doc.getFieldValue("id");
String type = doc.getDocumentMeta().get("type");
if (type == null)
type = "doc";
// Add each field of this doc to the index source
Map<String, Object> source = new HashMap<String, Object>();
for (String fieldName : doc.getFieldNames()) {
if (doc.getFieldValue(fieldName) != null) {
source.put(fieldName, doc.getFieldValue(fieldName));
}
}
IndexRequest request = new IndexRequest(defaultIndex, type, id).source(source);
bulkProcessor.add(request);
}
@Override
public void delete(String key) throws IOException {
DeleteRequest request = new DeleteRequest(defaultIndex, "doc", key);
bulkProcessor.add(request);
}
@Override
public void update(NutchDocument doc) throws IOException {
write(doc);
}
@Override
public void commit() throws IOException {
bulkProcessor.flush();
}
@Override
public void close() throws IOException {
// Close BulkProcessor (automatically flushes)
try {
bulkProcessor.awaitClose(bulkCloseTimeout, TimeUnit.SECONDS);
} catch (InterruptedException e) {
LOG.warn("interrupted while waiting for BulkProcessor to complete ({})", e.getMessage());
}
client.close();
if (node != null) {
node.close();
}
}
@Override
public String describe() {
StringBuffer sb = new StringBuffer("ElasticIndexWriter\n");
sb.append("\t").append(ElasticConstants.CLUSTER)
.append(" : elastic prefix cluster\n");
sb.append("\t").append(ElasticConstants.HOSTS).append(" : hostname\n");
sb.append("\t").append(ElasticConstants.PORT).append(" : port\n");
sb.append("\t").append(ElasticConstants.INDEX)
.append(" : elastic index command \n");
sb.append("\t").append(ElasticConstants.MAX_BULK_DOCS)
.append(" : elastic bulk index doc counts. (default ")
.append(DEFAULT_MAX_BULK_DOCS).append(")\n");
sb.append("\t").append(ElasticConstants.MAX_BULK_LENGTH)
.append(" : elastic bulk index length in bytes. (default ")
.append(DEFAULT_MAX_BULK_LENGTH).append(")\n");
sb.append("\t").append(ElasticConstants.EXPONENTIAL_BACKOFF_MILLIS)
.append(" : elastic bulk exponential backoff initial delay in milliseconds. (default ")
.append(DEFAULT_EXP_BACKOFF_MILLIS).append(")\n");
sb.append("\t").append(ElasticConstants.EXPONENTIAL_BACKOFF_RETRIES)
.append(" : elastic bulk exponential backoff max retries. (default ")
.append(DEFAULT_EXP_BACKOFF_RETRIES).append(")\n");
sb.append("\t").append(ElasticConstants.BULK_CLOSE_TIMEOUT)
.append(" : elastic timeout for the last bulk in seconds. (default ")
.append(DEFAULT_BULK_CLOSE_TIMEOUT).append(")\n");
return sb.toString();
}
@Override
public void setConf(Configuration conf) {
config = conf;
String cluster = conf.get(ElasticConstants.CLUSTER);
String hosts = conf.get(ElasticConstants.HOSTS);
if (StringUtils.isBlank(cluster) && StringUtils.isBlank(hosts)) {
String message = "Missing elastic.cluster and elastic.host. At least one of them should be set in nutch-site.xml ";
message += "\n" + describe();
LOG.error(message);
throw new RuntimeException(message);
}
}
@Override
public Configuration getConf() {
return config;
}
}