package focusedCrawler.target.repository;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.entity.AbstractHttpEntity;
import org.apache.http.entity.ContentType;
import org.apache.http.nio.entity.NStringEntity;
import org.apache.http.util.EntityUtils;
import org.elasticsearch.client.Response;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestClientBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import focusedCrawler.target.model.Page;
import focusedCrawler.target.model.TargetModelElasticSearch;
import focusedCrawler.target.repository.elasticsearch.ElasticSearchConfig;
/**
 * {@link TargetRepository} implementation that stores pages in Elasticsearch using the
 * low-level REST client.
 *
 * <p>On construction the repository builds a {@link RestClient} from the given configuration
 * and, when the target index does not exist yet, creates it with a mapping chosen according to
 * the version reported by the Elasticsearch server.
 */
public class ElasticSearchRestTargetRepository implements TargetRepository {

    private static final Map<String, String> EMPTY_MAP = Collections.<String, String>emptyMap();
    private static final Logger logger = LoggerFactory.getLogger(ElasticSearchRestTargetRepository.class);
    private static final ObjectMapper mapper = new ObjectMapper();
    static {
        mapper.disable(SerializationFeature.WRITE_DATES_AS_TIMESTAMPS);
    }

    /** Version assumed when the server does not report a version number. */
    private static final String DEFAULT_ES_VERSION = "5.x.x";

    /** First major version for which the keyword/text mapping is used instead of the legacy one. */
    private static final int FIRST_KEYWORD_MAJOR_VERSION = 5;

    /** Legacy field mapping based on the {@code string} type and {@code not_analyzed} index option. */
    private static final String TARGET_MAPPING_1X = ""
            + "{"
            + " \"properties\": {"
            + " \"domain\": {\"type\": \"string\",\"index\": \"not_analyzed\"},"
            + " \"words\": {\"type\": \"string\",\"index\": \"not_analyzed\"},"
            + " \"wordsMeta\": {\"type\": \"string\",\"index\": \"not_analyzed\"},"
            + " \"retrieved\": {\"type\": \"date\",\"format\": \"dateOptionalTime\"},"
            + " \"text\": {\"type\": \"string\"},"
            + " \"title\": {\"type\": \"string\"},"
            + " \"url\": {\"type\": \"string\",\"index\": \"not_analyzed\"},"
            + " \"topPrivateDomain\": {\"type\": \"string\",\"index\": \"not_analyzed\"},"
            + " \"isRelevant\": {\"type\": \"string\",\"index\": \"not_analyzed\"},"
            + " \"relevance\": {\"type\": \"double\"}"
            + " }"
            + "}";

    /** Field mapping based on the {@code keyword} and {@code text} types. */
    private static final String PAGE_MAPPING_5X = ""
            + "{"
            + " \"properties\": {"
            + " \"domain\": {\"type\": \"keyword\",\"index\": true},"
            + " \"words\": {\"type\": \"keyword\",\"index\": true},"
            + " \"wordsMeta\": {\"type\": \"keyword\",\"index\": true},"
            + " \"retrieved\": {\"type\": \"date\",\"format\": \"dateOptionalTime\"},"
            + " \"text\": {\"type\": \"text\"},"
            + " \"title\": {\"type\": \"text\"},"
            + " \"url\": {\"type\": \"keyword\",\"index\": true},"
            + " \"topPrivateDomain\": {\"type\": \"keyword\",\"index\": true},"
            + " \"isRelevant\": {\"type\": \"keyword\",\"index\": true},"
            + " \"relevance\": {\"type\": \"double\"}"
            + " }"
            + "}";

    private RestClient client;
    private final String typeName;
    private final String indexName;

    /**
     * Creates the repository, connects to Elasticsearch and makes sure the index exists.
     *
     * @param config    source of the REST hosts and timeouts
     * @param indexName name of the index documents are written to
     * @param typeName  name of the mapping type documents are written to
     * @throws RuntimeException if a configured host is not a valid URL, or if checking for or
     *                          creating the index fails
     */
    public ElasticSearchRestTargetRepository(ElasticSearchConfig config,
                                             String indexName,
                                             String typeName) {
        this.indexName = indexName;
        this.typeName = typeName;
        this.client = createRestClient(config);
        try {
            this.createIndexMapping(indexName);
        } catch (RuntimeException e) {
            // The caller never receives this instance when construction fails, so nobody else
            // could close the client: release it here before propagating the failure.
            try {
                this.client.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

    /**
     * Creates the index with the appropriate field mapping unless it already exists.
     *
     * @param indexName name of the index to check and, if missing, create
     * @throws RuntimeException if the existence/version check or the index creation fails
     */
    private void createIndexMapping(String indexName) {
        String indexEndpoint = "/" + indexName;
        boolean exists;
        String esVersion;
        try {
            exists = indexExists(indexEndpoint);
            esVersion = fetchServerVersion();
            logger.info("Elasticsearch version: {}", esVersion);
        } catch (IOException e) {
            throw new RuntimeException(
                    "Failed to check whether index already exists in Elasticsearch.", e);
        }
        if (exists) {
            return;
        }

        String pageProperties = usesKeywordMapping(esVersion) ? PAGE_MAPPING_5X : TARGET_MAPPING_1X;
        String mapping =
                "{"
                + " \"mappings\": {"
                + " \"" + typeName + "\": " + pageProperties
                + " }"
                + "}";
        try {
            AbstractHttpEntity entity = createJsonEntity(mapping);
            Response response = client.performRequest("PUT", indexEndpoint, EMPTY_MAP, entity);
            if (response.getStatusLine().getStatusCode() != 200) {
                throw new RuntimeException(
                        "Failed to create index in Elasticsearch." + response.toString());
            }
        } catch (IOException e) {
            throw new RuntimeException("Failed to create index in Elasticsearch.", e);
        }
    }

    /** Issues a HEAD request for the index endpoint and reports whether it answered 200. */
    private boolean indexExists(String indexEndpoint) throws IOException {
        Response existsResponse = client.performRequest("HEAD", indexEndpoint);
        return existsResponse.getStatusLine().getStatusCode() == 200;
    }

    /**
     * Reads {@code version.number} from the server root endpoint, falling back to
     * {@link #DEFAULT_ES_VERSION} when the field is absent or empty.
     */
    private String fetchServerVersion() throws IOException {
        Response rootResponse = client.performRequest("GET", "/");
        String json = EntityUtils.toString(rootResponse.getEntity());
        String versionNumber = mapper.readTree(json).path("version").path("number").asText();
        if (versionNumber == null || versionNumber.isEmpty()) {
            return DEFAULT_ES_VERSION;
        }
        return versionNumber;
    }

    /**
     * Decides whether the keyword/text mapping should be used for the given server version.
     *
     * <p>Every major version from 5 upwards selects the keyword/text mapping, not only 5.x:
     * the legacy {@code string} type is not accepted for new indices by later major versions.
     * A version whose major component is not a number selects the legacy mapping.
     */
    private static boolean usesKeywordMapping(String version) {
        int firstDot = version.indexOf('.');
        String major = firstDot < 0 ? version : version.substring(0, firstDot);
        try {
            return Integer.parseInt(major.trim()) >= FIRST_KEYWORD_MAJOR_VERSION;
        } catch (NumberFormatException e) {
            return false;
        }
    }

    /** Wraps a JSON string in an HTTP entity with the {@code application/json} content type. */
    private AbstractHttpEntity createJsonEntity(String mapping) {
        return new NStringEntity(mapping, ContentType.APPLICATION_JSON);
    }

    /**
     * Indexes the page under a document id derived from its URL-encoded URL.
     *
     * @param page the page to store
     * @return {@code true} only when Elasticsearch answers with status 201; any other status
     *         that does not raise an exception yields {@code false}
     *         (NOTE(review): presumably this includes overwriting an existing document - confirm)
     * @throws RuntimeException if serialization or the HTTP request fails
     */
    @Override
    public boolean insert(Page page) {
        TargetModelElasticSearch data = new TargetModelElasticSearch(page);
        String docId = encodeUrl(page.getURL().toString());
        String endpoint = "/" + indexName + "/" + typeName + "/" + docId;
        AbstractHttpEntity entity = createJsonEntity(serializeAsJson(data));
        try {
            Response response = client.performRequest("PUT", endpoint, EMPTY_MAP, entity);
            return response.getStatusLine().getStatusCode() == 201;
        } catch (IOException e) {
            throw new RuntimeException("Failed to index page.", e);
        }
    }

    /** URL-encodes the given string as UTF-8 so it can be used as a path segment. */
    private String encodeUrl(String url) {
        try {
            return URLEncoder.encode(url, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            throw new IllegalStateException("Failed to URL encode string: "+url, e);
        }
    }

    /** Serializes the model with the shared {@link ObjectMapper}. */
    private String serializeAsJson(Object model) {
        try {
            return mapper.writeValueAsString(model);
        } catch (JsonProcessingException e) {
            throw new RuntimeException("Failed to serialize TargetModel to JSON.", e);
        }
    }

    /**
     * Builds a REST client for the hosts listed in the configuration, stores it in this
     * repository's {@code client} field and returns it.
     *
     * <p>Each host entry is parsed as a URL; its host name, port and protocol are all passed on
     * to the client so that, for example, {@code https} hosts are contacted over HTTPS.
     *
     * @param config source of the REST hosts and the connect, socket and max-retry timeouts
     * @return the newly built client
     * @throws RuntimeException if a host entry is not a valid URL
     */
    public RestClient createRestClient(ElasticSearchConfig config) {
        List<String> esHosts = config.getRestApiHosts();
        List<HttpHost> hosts = new ArrayList<>();
        for (String host : esHosts) {
            try {
                URL url = new URL(host);
                // Keep the scheme: without it the host would always be contacted over plain http.
                hosts.add(new HttpHost(url.getHost(), url.getPort(), url.getProtocol()));
            } catch (MalformedURLException e) {
                throw new RuntimeException("Failed to initialize Elasticsearch REST client. "
                        + "Invalid host: " + host, e);
            }
        }
        HttpHost[] httpHostsArray = hosts.toArray(new HttpHost[hosts.size()]);
        client = RestClient.builder(httpHostsArray)
                .setRequestConfigCallback(new RestClientBuilder.RequestConfigCallback() {
                    @Override
                    public RequestConfig.Builder customizeRequestConfig(RequestConfig.Builder requestConfigBuilder) {
                        return requestConfigBuilder
                                .setConnectTimeout(config.getRestConnectTimeout())
                                .setSocketTimeout(config.getRestSocketTimeout());
                    }
                })
                .setMaxRetryTimeoutMillis(config.getRestMaxRetryTimeoutMillis())
                .build();
        logger.info("Initialized Elasticsearch REST client for hosts: {}", Arrays.toString(httpHostsArray));
        return client;
    }

    /**
     * Closes the underlying REST client, if one was created.
     *
     * @throws RuntimeException if closing the client fails
     */
    @Override
    public void close() {
        try {
            if (client != null) {
                client.close();
            }
        } catch (IOException e) {
            throw new RuntimeException("Failed to close Elasticsearch REST client", e);
        }
    }
}