/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
//TODO refactor the dependencies out of root ivy file
package org.apache.nutch.indexwriter.elasticrest;
import io.searchbox.client.JestClient;
import io.searchbox.client.JestClientFactory;
import io.searchbox.client.JestResult;
import io.searchbox.client.JestResultHandler;
import io.searchbox.client.config.HttpClientConfig;
import io.searchbox.core.Bulk;
import io.searchbox.core.BulkResult;
import io.searchbox.core.Delete;
import io.searchbox.core.Index;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang3.exception.ExceptionUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobConf;
import org.apache.http.HttpResponse;
import org.apache.http.concurrent.BasicFuture;
import org.apache.http.conn.ssl.DefaultHostnameVerifier;
import org.apache.http.conn.ssl.NoopHostnameVerifier;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.nio.conn.SchemeIOSessionStrategy;
import org.apache.http.nio.conn.ssl.SSLIOSessionStrategy;
import org.apache.http.ssl.SSLContextBuilder;
import org.apache.http.ssl.TrustStrategy;
import org.apache.nutch.indexer.IndexWriter;
import org.apache.nutch.indexer.NutchDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.SSLContext;
import java.io.BufferedReader;
import java.io.IOException;
import java.net.URL;
import java.security.KeyManagementException;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.HashMap;
import java.util.Map;
import java.util.MissingResourceException;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
/**
 * {@link IndexWriter} that sends documents to Elasticsearch over its REST API
 * using the Jest client.
 *
 * <p>
 * Documents passed to {@link #write(NutchDocument)} are accumulated in a bulk
 * request. The bulk is sent asynchronously once it holds the configured
 * maximum number of documents or characters; the outcome of a bulk is
 * inspected at the start of the next {@link #commit()}. Deletes are executed
 * immediately and synchronously, outside of the bulk.
 * </p>
 *
 * <p>
 * NOTE(review): when https is enabled the SSL context built here trusts every
 * server certificate. Only the hostname check is configurable. Confirm that
 * this is acceptable for the deployment.
 * </p>
 */
public class ElasticRestIndexWriter implements IndexWriter {
  public static final Logger LOG = LoggerFactory
      .getLogger(ElasticRestIndexWriter.class);

  private static final int DEFAULT_MAX_BULK_DOCS = 250;
  private static final int DEFAULT_MAX_BULK_LENGTH = 2500500;

  private JestClient client;
  private String defaultIndex;
  private String defaultType = null;
  private Configuration config;

  /** Bulk request currently being filled; null once handed over for sending. */
  private Bulk.Builder bulkBuilder;

  private int port = -1;
  private String host = null;
  private boolean https = false;
  private String user = null;
  private String password = null;
  private boolean trustAllHostnames = false;

  private int maxBulkDocs;
  private int maxBulkLength;

  /** Total number of documents handed to {@link #write(NutchDocument)}. */
  private long indexedDocs = 0;
  /** Number of documents in the bulk currently being filled. */
  private int bulkDocs = 0;
  /** Approximate size (in characters) of the bulk currently being filled. */
  private int bulkLength = 0;
  /** Whether {@link #commit()} should prepare a fresh bulk after flushing. */
  private boolean createNewBulk = false;

  /** Duration in ms of the last completed bulk; written by the callback thread. */
  private volatile long millis;
  /** Completion handle of the bulk that is currently in flight, if any. */
  private BasicFuture<JestResult> basicFuture = null;

  /**
   * Reads the connection settings from the job configuration and creates the
   * Jest client and the first bulk request.
   *
   * @param job
   *          job configuration holding the {@link ElasticRestConstants}
   *          settings
   * @param name
   *          not used by this writer
   * @throws IOException
   *           if the node URL cannot be built from the configured values
   * @throws IllegalStateException
   *           if no host is configured or the port is not greater than 1
   * @throws SecurityException
   *           if https is enabled and the SSL context cannot be created
   */
  @Override
  public void open(JobConf job, String name) throws IOException {
    host = job.get(ElasticRestConstants.HOST);
    port = job.getInt(ElasticRestConstants.PORT, 9200);
    user = job.get(ElasticRestConstants.USER);
    password = job.get(ElasticRestConstants.PASSWORD);
    https = job.getBoolean(ElasticRestConstants.HTTPS, false);
    trustAllHostnames = job.getBoolean(ElasticRestConstants.HOSTNAME_TRUST, false);

    JestClientFactory jestClientFactory = new JestClientFactory();
    URL urlOfElasticsearchNode = new URL(https ? "https" : "http", host, port, "");

    if (host == null || port <= 1) {
      throw new IllegalStateException("No host or port specified. Please set the host and port in nutch-site.xml");
    }

    HttpClientConfig.Builder builder = new HttpClientConfig.Builder(
        urlOfElasticsearchNode.toString()).multiThreaded(true)
            .connTimeout(300000).readTimeout(300000);

    if (https) {
      if (user != null && password != null) {
        builder.defaultCredentials(user, password);
      }

      // The SSL machinery is only needed (and only built) for https.
      SSLContext sslContext = createTrustAllSslContext();

      // skip hostname checks only when explicitly requested
      HostnameVerifier hostnameVerifier;
      if (trustAllHostnames) {
        hostnameVerifier = NoopHostnameVerifier.INSTANCE;
      } else {
        hostnameVerifier = new DefaultHostnameVerifier();
      }

      // Use the same hostname verifier for both transports so that sync
      // calls (e.g. delete) and async calls (bulk) behave identically.
      SSLConnectionSocketFactory sslSocketFactory = new SSLConnectionSocketFactory(
          sslContext, hostnameVerifier);
      SchemeIOSessionStrategy httpsIOSessionStrategy = new SSLIOSessionStrategy(
          sslContext, hostnameVerifier);

      builder.defaultSchemeForDiscoveredNodes("https")
          .sslSocketFactory(sslSocketFactory) // this only affects sync calls
          .httpsIOSessionStrategy(httpsIOSessionStrategy); // this only affects async calls
    }

    jestClientFactory.setHttpClientConfig(builder.build());
    client = jestClientFactory.getObject();

    defaultIndex = job.get(ElasticRestConstants.INDEX, "nutch");
    defaultType = job.get(ElasticRestConstants.TYPE, "doc");

    maxBulkDocs = job.getInt(ElasticRestConstants.MAX_BULK_DOCS, DEFAULT_MAX_BULK_DOCS);
    maxBulkLength = job.getInt(ElasticRestConstants.MAX_BULK_LENGTH, DEFAULT_MAX_BULK_LENGTH);

    bulkBuilder = newBulkBuilder();
  }

  /**
   * Builds an SSL context whose trust strategy accepts every certificate.
   *
   * @return the SSL context
   * @throws SecurityException
   *           wrapping the underlying cause if the context cannot be built
   */
  private SSLContext createTrustAllSslContext() {
    try {
      // trust ALL certificates
      return new SSLContextBuilder()
          .loadTrustMaterial(new TrustStrategy() {
            @Override
            public boolean isTrusted(X509Certificate[] arg0, String arg1) throws CertificateException {
              return true;
            }
          }).build();
    } catch (NoSuchAlgorithmException | KeyManagementException | KeyStoreException e) {
      LOG.error("Failed to instantiate sslcontext object: \n{}",
          ExceptionUtils.getStackTrace(e));
      throw new SecurityException("Failed to instantiate sslcontext object", e);
    }
  }

  /** Creates an empty bulk request bound to the default index and type. */
  private Bulk.Builder newBulkBuilder() {
    return new Bulk.Builder().defaultIndex(defaultIndex).defaultType(defaultType);
  }

  /**
   * Adds an index request for the document to the current bulk and flushes the
   * bulk when the document count or the accumulated length reaches its limit.
   *
   * <p>
   * The document id is taken from the "id" field and the type from the "type"
   * document meta entry, falling back to the default type. Multi-valued fields
   * are indexed with all of their values; fields without a value are skipped.
   * </p>
   *
   * @param doc
   *          the document to index
   * @throws IOException
   *           declared by the interface and by {@link #commit()}
   */
  @Override
  public void write(NutchDocument doc) throws IOException {
    String id = (String) doc.getFieldValue("id");
    String type = doc.getDocumentMeta().get("type");
    if (type == null) {
      type = defaultType;
    }

    // A commit() that was not asked to prepare a new bulk leaves no builder
    // behind; recreate it so the writer stays usable.
    if (bulkBuilder == null) {
      bulkBuilder = newBulkBuilder();
    }

    Map<String, Object> source = new HashMap<String, Object>();

    // Loop through all fields of this doc
    for (String fieldName : doc.getFieldNames()) {
      if (doc.getField(fieldName).getValues().size() > 1) {
        // index every value of a multi-valued field, not just the first one
        source.put(fieldName, doc.getField(fieldName).getValues());
        // Loop through the values to keep track of the size of this
        // document
        for (Object value : doc.getField(fieldName).getValues()) {
          if (value != null) {
            bulkLength += value.toString().length();
          }
        }
      } else {
        Object value = doc.getFieldValue(fieldName);
        if (value == null) {
          // nothing to index for this field
          continue;
        }
        source.put(fieldName, value);
        bulkLength += value.toString().length();
      }
    }

    Index indexRequest = new Index.Builder(source).index(defaultIndex)
        .type(type).id(id).build();

    // Add this indexing request to a bulk request
    bulkBuilder.addAction(indexRequest);
    indexedDocs++;
    bulkDocs++;

    if (bulkDocs >= maxBulkDocs || bulkLength >= maxBulkLength) {
      LOG.info(
          "Processing bulk request [docs = {}, length = {}, total docs = {}, last doc in bulk = '{}']",
          bulkDocs, bulkLength, indexedDocs, id);
      // Flush the bulk of indexing requests
      createNewBulk = true;
      commit();
    }
  }

  /**
   * Deletes the document with the given id from the default index and type.
   * The request is executed synchronously and is not part of the bulk.
   *
   * @param key
   *          id of the document to delete
   * @throws IOException
   *           if the request fails; the error is logged before being rethrown
   */
  @Override
  public void delete(String key) throws IOException {
    try {
      client.execute(new Delete.Builder(key).index(defaultIndex)
          .type(defaultType).build());
    } catch (IOException e) {
      LOG.error(ExceptionUtils.getStackTrace(e));
      throw e;
    }
  }

  /**
   * Updates a document by indexing it again via {@link #write(NutchDocument)}.
   *
   * @param doc
   *          the document to index
   * @throws IOException
   *           if writing fails; the error is logged before being rethrown
   */
  @Override
  public void update(NutchDocument doc) throws IOException {
    try {
      write(doc);
    } catch (IOException e) {
      LOG.error(ExceptionUtils.getStackTrace(e));
      throw e;
    }
  }

  /**
   * Waits for the bulk that is in flight (if any), then sends the current bulk
   * asynchronously if it contains documents. A new bulk is prepared only when
   * {@code createNewBulk} is set.
   *
   * @throws IOException
   *           declared by the interface
   * @throws RuntimeException
   *           if the previously sent bulk request failed
   */
  @Override
  public void commit() throws IOException {
    awaitPendingBulk();

    if (bulkBuilder != null) {
      if (bulkDocs > 0) {
        // start a flush, note that this is an asynchronous call
        dispatchBulk(bulkBuilder.build());
      }
      bulkBuilder = null;
    }

    // Nothing is buffered any more, whether or not a new bulk gets prepared.
    bulkDocs = 0;
    bulkLength = 0;

    if (createNewBulk) {
      // Prepare a new bulk request
      bulkBuilder = newBulkBuilder();
    }
  }

  /**
   * Blocks until the in-flight bulk (if any) has finished and reports its
   * outcome.
   *
   * @throws RuntimeException
   *           if the bulk request failed (its handler delivered no result)
   */
  private void awaitPendingBulk() {
    if (basicFuture == null) {
      return;
    }
    BasicFuture<JestResult> pending = basicFuture;
    basicFuture = null;

    // wait for previous to finish
    long beforeWait = System.currentTimeMillis();
    try {
      JestResult result = pending.get();
      if (result == null) {
        throw new RuntimeException(
            "Previous bulk request failed, see the log for the cause");
      }
      if (!result.isSucceeded()) {
        LOG.error("Previous bulk request was not successful: {}",
            result.getErrorMessage());
      }
      long msWaited = System.currentTimeMillis() - beforeWait;
      LOG.info("Previous took in ms {}, including wait {}", millis, msWaited);
    } catch (InterruptedException e) {
      // keep the interrupt visible to the caller
      Thread.currentThread().interrupt();
      LOG.error("Error waiting for result ", e);
    } catch (ExecutionException e) {
      LOG.error("Error waiting for result ", e);
    }
  }

  /**
   * Sends the bulk asynchronously and remembers its completion handle in
   * {@code basicFuture}.
   *
   * @param bulk
   *          the bulk request to send
   */
  private void dispatchBulk(Bulk bulk) {
    // The callback works on its own references so that it never touches a
    // future belonging to a later bulk.
    final BasicFuture<JestResult> pending = new BasicFuture<>(null);
    final long startedAt = System.currentTimeMillis();
    basicFuture = pending;

    client.executeAsync(bulk, new JestResultHandler<BulkResult>() {
      @Override
      public void completed(BulkResult bulkResult) {
        // publish the duration before releasing the waiting thread
        millis = System.currentTimeMillis() - startedAt;
        pending.completed(bulkResult);
      }

      @Override
      public void failed(Exception e) {
        LOG.error("Failed result: ", e);
        // a null result marks the bulk as failed for the waiting thread
        pending.completed(null);
      }
    });
  }

  /**
   * Flushes the remaining documents, waits for the last bulk to finish and
   * shuts the client down. The client is shut down even if flushing fails.
   *
   * @throws IOException
   *           declared by the interface and by {@link #commit()}
   */
  @Override
  public void close() throws IOException {
    try {
      // Flush pending requests
      LOG.info(
          "Processing remaining requests [docs = {}, length = {}, total docs = {}]",
          bulkDocs, bulkLength, indexedDocs);
      createNewBulk = false;
      commit();

      // flush one more time to finalize the last bulk
      LOG.info("Processing to finalize last execute");
      createNewBulk = false;
      commit();
    } finally {
      // Close
      if (client != null) {
        client.shutdownClient();
      }
    }
  }

  /**
   * Describes the configuration properties read by this writer.
   *
   * @return a multi-line, human-readable description
   */
  @Override
  public String describe() {
    StringBuilder sb = new StringBuilder("ElasticRestIndexWriter\n");
    sb.append("\t").append(ElasticRestConstants.HOST).append(" : hostname\n");
    sb.append("\t").append(ElasticRestConstants.PORT).append(" : port\n");
    sb.append("\t").append(ElasticRestConstants.INDEX)
        .append(" : elastic index command \n");
    sb.append("\t").append(ElasticRestConstants.TYPE)
        .append(" : elastic index type (default doc)\n");
    sb.append("\t").append(ElasticRestConstants.HTTPS)
        .append(" : use https (default false)\n");
    sb.append("\t").append(ElasticRestConstants.USER)
        .append(" : user name, only used with https\n");
    sb.append("\t").append(ElasticRestConstants.PASSWORD)
        .append(" : password, only used with https\n");
    sb.append("\t").append(ElasticRestConstants.HOSTNAME_TRUST)
        .append(" : skip hostname verification (default false)\n");
    sb.append("\t").append(ElasticRestConstants.MAX_BULK_DOCS)
        .append(" : elastic bulk index doc counts. (default 250) \n");
    sb.append("\t").append(ElasticRestConstants.MAX_BULK_LENGTH)
        .append(" : elastic bulk index length. (default 2500500 ~2.5MB)\n");
    return sb.toString();
  }

  /**
   * Stores the configuration and rejects it when neither the host nor the
   * port property is set.
   *
   * @param conf
   *          the configuration to use
   * @throws RuntimeException
   *           if both the host and the port property are blank
   */
  @Override
  public void setConf(Configuration conf) {
    config = conf;
    String configuredHost = conf.get(ElasticRestConstants.HOST);
    String configuredPort = conf.get(ElasticRestConstants.PORT);

    if (StringUtils.isBlank(configuredHost) && StringUtils.isBlank(configuredPort)) {
      String message = "Missing elastic.rest.host and elastic.rest.port. At least one of them should be set in nutch-site.xml ";
      message += "\n" + describe();
      LOG.error(message);
      throw new RuntimeException(message);
    }
  }

  /**
   * @return the configuration passed to {@link #setConf(Configuration)}
   */
  @Override
  public Configuration getConf() {
    return config;
  }
}