/* Copyright 2013 Fabian Steeg, hbz. Licensed under the Eclipse Public License 1.0 */
package models;
import static org.elasticsearch.index.query.QueryBuilders.boolQuery;
import static org.elasticsearch.index.query.QueryBuilders.matchQuery;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.stream.Collectors;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.elasticsearch.action.search.SearchRequestBuilder;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.search.SearchType;
import org.elasticsearch.client.Client;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.transport.InetSocketTransportAddress;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.FilterBuilders;
import org.elasticsearch.index.query.MatchQueryBuilder;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.SearchHits;
import org.elasticsearch.search.sort.SortBuilders;
import org.elasticsearch.search.sort.SortOrder;
import com.google.common.base.Joiner;
import com.google.common.base.Predicate;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import controllers.Serialization;
import models.queries.AbstractIndexQuery;
import models.queries.LobidItems;
import models.queries.LobidResources;
import play.Logger;
import play.mvc.Http.Request;
import play.mvc.Results.Chunks;
import play.mvc.Results.StringChunks;
/**
* Search documents in an ElasticSearch index.
*
* @author Fabian Steeg (fsteeg)
* @author Pascal Christoph (dr0i)
*/
public class Search {
/**
 * The ElasticSearch server to use, built from the configuration values
 * "application.es.server" and "application.es.port".
 */
public static final InetSocketTransportAddress ES_SERVER =
new InetSocketTransportAddress(
Index.CONFIG.getString("application.es.server"),
Index.CONFIG.getInt("application.es.port"));
/**
 * The ElasticSearch cluster to use, read from the configuration value
 * "application.es.cluster".
 */
public static final String ES_CLUSTER_NAME =
Index.CONFIG.getString("application.es.cluster");
/*
 * The default transport client, created once when this class is initialized.
 * It is configured with ES_CLUSTER_NAME and connects to ES_SERVER, so both
 * constants must be declared above this field.
 */
private static Client productionClient = new TransportClient(ImmutableSettings
.settingsBuilder().put("cluster.name", ES_CLUSTER_NAME).build())
.addTransportAddress(ES_SERVER);
/**
 * The ElasticSearch client to use. Starts out as the production client and is
 * a mutable static so that it can be swapped (see clientSet and clientReset).
 */
public static Client client = productionClient;
/* TODO find a better way to inject the client for testing */
/** Required: the index to search and the query parameters. */
private final Index index;
private final Map<Parameter, String> parameters;
/** Optional settings, configured through the chaining methods below. */
private String field = "";
private String owner = "";
private String set = "";
private int size = 50;
private int from = 0;
private String type = "";
private String sort = "";
private String scroll = "";
/**
 * As there is a memory issue within Promise we mustn't allow data > RAM. As
 * experience shows, this is around 4 M docs (depending on the serialisation).
 */
public static final int MAX_SCROLL_HITS = 3000000;
/** Lazily initialized results of the search (null until first requested). */
private List<Document> documents = null;
private Long hitCount = null;
/*
 * Set to true by the background thread that streams a scroll scan and reset
 * to false when that thread is done; read by other threads through
 * doingScrollScanNow(). Declared volatile so that those reads are guaranteed
 * to see the latest write.
 */
private static volatile boolean doingScrollScanNow = false;
/** The chunked output that a scroll scan dump is written to. */
private Chunks.Out<String> messageOut;
/**
 * Create a search for the given parameters in the given index. The index is
 * stored as passed and only checked once the search is validated.
 *
 * @param parameters The search parameters (see {@link Index#queries()} ),
 *          must not be null
 * @param index The index to search (see {@link Index})
 * @throws IllegalArgumentException If the parameters are null
 */
public Search(final Map<Parameter, String> parameters, final Index index) {
  if (null == parameters) {
    throw new IllegalArgumentException("Can't work with null parameters");
  }
  this.parameters = parameters;
  this.index = index;
}
/**
 * Replace the shared elasticsearch client used by all searches.
 *
 * @param newClient The new elasticsearch client to use.
 */
public static void clientSet(Client newClient) {
  Search.client = newClient;
}
/** Switch the shared elasticsearch client back to the production client. */
public static void clientReset() {
  Search.client = Search.productionClient;
}
/**
 * Return the documents matching this search. The search is executed on the
 * first call that finds no stored documents; the stored list is returned
 * afterwards.
 *
 * @return The documents matching this search
 */
public List<Document> documents() {
  final boolean notSearchedYet = (this.documents == null);
  if (notSearchedYet) {
    initResults();
  }
  return this.documents;
}
/**
 * Return the overall hit count of this search. The search is executed first
 * if no hit count has been stored yet.
 *
 * @return The total number of hits for this search.
 */
public long totalHits() {
  final boolean notSearchedYet = (this.hitCount == null);
  if (notSearchedYet) {
    initResults();
  }
  return this.hitCount.longValue();
}
/** Run the search and store both its documents and its total hit count. */
private void initResults() {
  final Pair<List<Document>, Long> searched = doSearch();
  this.hitCount = searched.getRight();
  this.documents = searched.getLeft();
}
/**
 * Validate the search settings, execute the query once and convert the hits
 * to documents.
 *
 * @return A pair of the matching documents (left) and the total number of
 *         hits reported by the response (right)
 * @throws IllegalArgumentException If the search settings are invalid
 */
private Pair<List<Document>, Long> doSearch() {
  validateSearchParameters();
  final QueryBuilder queryBuilder = createQuery();
  Logger.trace("Using query: " + queryBuilder);
  // Parse the parameter's value. Boolean.getBoolean(name) would instead look
  // up a JVM system property called `name`, and so practically always
  // yielded false here. parseBoolean(null) is false, so a missing parameter
  // still selects the regular search type.
  final boolean scan =
      Boolean.parseBoolean(parameters.get(Parameter.SCROLL));
  final SearchResponse response = search(queryBuilder,
      scan ? SearchType.SCAN : SearchType.DFS_QUERY_THEN_FETCH);
  Logger.trace("Got response: " + response);
  final SearchHits hits = response.getHits();
  final List<Document> docs = asDocuments(hits, fields(parameters));
  final Pair<List<Document>, Long> result =
      new ImmutablePair<>(docs, hits.getTotalHits());
  Logger.debug(String.format("Got %s hits overall, created %s matching docs",
      hits.getTotalHits(), docs.size()));
  return result;
}
/**
 * Collect the index fields of all queries registered for the given
 * parameters, in the iteration order of the parameter keys.
 *
 * @param queries The parameters whose query fields should be collected
 * @return The fields of all corresponding index queries
 */
private List<String> fields(Map<Parameter, String> queries) {
  return queries.keySet().stream()
      .map(parameter -> index.queries().get(parameter))
      .flatMap(indexQuery -> indexQuery.fields().stream())
      .collect(Collectors.toList());
}
/**
 * Optional: pick a single field from the full result. The value is passed on
 * to every document created from the hits.
 *
 * @param resultField The field to return as the result
 * @return this search object (for chaining)
 */
public Search field(final String resultField) {
  field = resultField;
  return this;
}
/**
 * Optional: restrict the search to resources of an owner. The value "*" is
 * treated specially when querying: it adds no owner query but an exists
 * filter on the exemplar field.
 *
 * @param resourceOwner An ID for the owner of requested resources
 * @return this search object (for chaining)
 */
public Search owner(final String resourceOwner) {
  owner = resourceOwner;
  return this;
}
/**
 * Optional: restrict the search to resources in a set. An empty value adds no
 * set restriction to the query.
 *
 * @param resourceSet An ID for the set the requested resources should be in
 * @return this search object (for chaining)
 */
public Search set(final String resourceSet) {
  set = resourceSet;
  return this;
}
/**
 * Optional: specify which page of the result set to return. The values are
 * checked when the search is validated: a negative start index or a page
 * size above 100 is rejected.
 *
 * @param pageFrom The start index of the result set
 * @param pageSize The size of the result set
 * @return this search object (for chaining)
 */
public Search page(final int pageFrom, final int pageSize) {
  from = pageFrom;
  size = pageSize;
  return this;
}
/**
 * Optional: restrict the search to resources of a type. When the query is
 * created, the value is split at commas into several type values.
 *
 * @param resourceType The type of the requested resources
 * @return this search object (for chaining)
 */
public Search type(final String resourceType) {
  type = resourceType;
  return this;
}
/**
 * Optional: sort the results. An empty value adds no sorting; any other value
 * than "newest" or "oldest" is rejected when the search is validated.
 *
 * @param sortOrder The sort order: newest, oldest
 * @return this search object (for chaining)
 */
public Search sort(final String sortOrder) {
  sort = sortOrder;
  return this;
}
/**
 * Optional: set the scroll value for a scroll scan query. A value consisting
 * of exactly eight digits additionally restricts the query via a
 * changed-since query built from that value.
 *
 * @param scrollValue The value of the scroll parameter
 * @return this search object (for chaining)
 */
public Search scroll(final String scrollValue) {
  scroll = scrollValue;
  return this;
}
/**
 * Start a scroll scan for this search and report its total number of hits.
 * Every call issues a new scan request.
 *
 * @return number of hits of the query
 */
public long getTotalHits() {
  final SearchResponse firstScrollPage = startInitialResponse();
  return firstScrollPage.getHits().getTotalHits();
}
/**
 * Validate this search and return chunks that stream all its hits. Once the
 * chunked output is ready, the dump itself runs on a separate single-thread
 * executor, which is shut down right after the task has been submitted.
 *
 * @param request The clients request
 * @param serialization The wanted serialization of the returned data.
 * @return the chunks of the elasticsearch scroll scan query
 */
public Chunks<String> executeScrollScan(final Request request,
    final Serialization serialization) {
  validateSearchParameters();
  return new StringChunks() {
    @Override
    public void onReady(Chunks.Out<String> out) {
      setMessageOut(out);
      final ExecutorService dumpWorker = Executors.newSingleThreadExecutor();
      dumpWorker.execute(() -> {
        doingScrollScanNow = true;
        bulk(request, serialization);
      });
      // No further tasks are accepted; the submitted dump still runs.
      dumpWorker.shutdown();
    }
  };
}
/**
 * Report the class-wide scroll scan flag, which is shared by all instances.
 *
 * @return if a scroll scan is done right now
 */
public boolean doingScrollScanNow() {
  return Search.doingScrollScanNow;
}
/**
 * Stream all hits of the scroll scan to the chunked output, page by page.
 * JSON-LD output is wrapped in "[" and "]", RDF/XML output in a "root"
 * element. Fetching slows down when free memory runs low and is cancelled if
 * the computed pause would exceed 20 seconds. The output is always closed
 * and the scroll scan flag always reset, however the dump ends.
 *
 * @param request The clients request
 * @param serialization The wanted serialization of the returned data
 */
private void bulk(final Request request, final Serialization serialization) {
  final boolean jsonLd = serialization.equals(Serialization.JSON_LD);
  final boolean rdfXml = serialization.equals(Serialization.RDF_XML);
  try {
    final long startTime = System.currentTimeMillis();
    SearchResponse searchResponse = startInitialResponse();
    final SearchHits hits = getTotalHits(searchResponse);
    if (jsonLd) {
      getMessageOut().write("[");
    }
    if (rdfXml) {
      getMessageOut().write("<root>");
    }
    long cnt = 0;
    final long to = hits.getTotalHits();
    String str;
    while ((str = getHitsAsString(request, searchResponse, jsonLd, cnt, to,
        serialization)) != null) {
      if (jsonLd) {
        // Drop the first and last character of the page; presumably the
        // brackets of the page's own JSON array, as the enclosing brackets
        // are written once around the whole dump.
        getMessageOut().write(str.substring(1, str.length() - 1));
      } else {
        getMessageOut().write(str);
      }
      cnt = cnt + searchResponse.getHits().getHits().length;
      Logger.info("Doc " + cnt + " ,sec:"
          + ((System.currentTimeMillis() - startTime) / 1000));
      searchResponse =
          client.prepareSearchScroll(searchResponse.getScrollId())
              .setScroll("1m").execute().actionGet();
      // Getting is much faster than serving over http => memory fills up.
      // So try to break down dynamically (lesser mem, more pausing).
      // Eventually cancel if the pause would take too long (20 secs).
      final long freeMem = Runtime.getRuntime().freeMemory();
      // slow down if only 600 MB left
      if (freeMem < 600 * 1024 * 1024) {
        final long sleep =
            (long) Math.pow((1D / (freeMem / 1024D / 1024D / 1024D / 7D)), 2);
        Logger.info("Free memory low: " + freeMem / 1024 / 1024
            + " MB, sleeping for " + sleep + " ms.");
        if (sleep > 20000) {
          Logger.warn("Memory too low. Canceling request!");
          getMessageOut().write(
              "\nMemory too low. Canceling your request! Please contact 'semweb at hbz-nrw.de' or try again (probably some days) later.");
          break;
        }
        Thread.sleep(sleep);
      }
    }
  } catch (InterruptedException e) {
    // Restore the interrupt status so the executing thread can react to it,
    // and log through the application logger instead of stderr.
    Thread.currentThread().interrupt();
    Logger.error("Scroll scan dump was interrupted", e);
  } finally {
    if (jsonLd) {
      getMessageOut().write("\n]");
    }
    if (rdfXml) {
      getMessageOut().write("</root>");
    }
    getMessageOut().close();
    doingScrollScanNow = false;
    Logger.info("Finished scroll scan dump");
  }
}
/**
 * Log how many hits the response reports overall and how many matching
 * documents could be created from the hits it contains.
 *
 * @param response The search response to inspect
 * @return The hits of the given response
 */
private SearchHits getTotalHits(final SearchResponse response) {
  final SearchHits responseHits = response.getHits();
  final int createdDocs =
      asDocuments(responseHits, fields(parameters)).size();
  Logger.info(String.format("Got %s hits overall, created %s matching docs",
      responseHits.getTotalHits(), createdDocs));
  return responseHits;
}
/**
 * Start a scan search for this search's query and fetch the first scroll
 * page, keeping the scroll alive for one minute.
 *
 * @return The response of the first scroll request
 */
private SearchResponse startInitialResponse() {
  final QueryBuilder scrollQuery = createQuery();
  Logger.trace("Using scroll query: " + scrollQuery);
  final SearchResponse scanResponse = search(scrollQuery, SearchType.SCAN);
  Logger.trace("Got scroll response: " + scanResponse);
  return client.prepareSearchScroll(scanResponse.getScrollId())
      .setScroll(TimeValue.timeValueMinutes(1)).execute().actionGet();
}
/**
 * Serialize the hits of one scroll page. For JSON-LD, a separating ",\n" is
 * written to the output first for every page but the first one.
 *
 * @param request The clients request
 * @param response The scroll response holding the current page of hits
 * @param JSON_LD If the wanted serialization is JSON-LD
 * @param cnt The number of hits processed so far
 * @param to The total number of hits
 * @param serialization The wanted serialization of the returned data
 * @return The serialized page, or null if the page contains no hits or more
 *         than the total number of hits have been processed
 */
private String getHitsAsString(final Request request,
    final SearchResponse response, final boolean JSON_LD, long cnt, long to,
    Serialization serialization) {
  final boolean pageIsEmpty = response.getHits().getHits().length == 0;
  if (pageIsEmpty || cnt > to) {
    return null;
  }
  final boolean needsSeparator = JSON_LD && cnt > 0;
  if (needsSeparator) {
    getMessageOut().write(",\n");
  }
  final List<Document> pageDocs =
      asDocuments(response.getHits(), fields(parameters));
  return controllers.Application.getSerializedResult(pageDocs, index, field,
      to, false, request, serialization);
}
/**
 * Build the query for this search: all parameter queries combined as
 * required clauses, each further restricted by the optional owner, set, type
 * and scroll settings if they are set.
 *
 * @return The query object for this search
 * @throws IllegalStateException If no query could be constructed
 */
public QueryBuilder createQuery() {
  QueryBuilder queryBuilder = boolQueryFromParams();
  // The owner "*" adds no owner query; it is handled when searching.
  if (!owner.isEmpty() && !owner.equals("*")) {
    final QueryBuilder itemQuery = new LobidItems.OwnerQuery().build(owner);
    queryBuilder = boolQuery().must(queryBuilder).must(itemQuery);
  }
  if (!set.isEmpty()) {
    final QueryBuilder setQuery = new LobidResources.SetQuery().build(set);
    queryBuilder = boolQuery().must(queryBuilder).must(setQuery);
  }
  final String typeValues = AbstractIndexQuery.withoutBooleanOperator(type);
  final boolean isAndQuery = AbstractIndexQuery.isAndQuery(type);
  if (!typeValues.isEmpty()) {
    BoolQueryBuilder typeQuery = boolQuery();
    // Several comma-separated types are all required for an AND query,
    // otherwise they are added as optional clauses.
    for (String t : typeValues.split(",")) {
      final MatchQueryBuilder query = matchQuery("@graph.@type", t);
      typeQuery =
          isAndQuery ? typeQuery.must(query) : typeQuery.should(query);
    }
    queryBuilder = boolQuery().must(queryBuilder).must(typeQuery);
  }
  // Only a scroll value of exactly eight digits restricts the query.
  if (!scroll.isEmpty() && scroll.matches("\\d{8}")) {
    final QueryBuilder changedSinceQuery =
        new LobidResources.ChangedSinceQuery().build(scroll);
    queryBuilder = boolQuery().must(queryBuilder).must(changedSinceQuery);
  }
  if (queryBuilder == null) {
    // Report the search parameters: the query builder itself is null here,
    // so formatting it would always print 'null'.
    throw new IllegalStateException(String.format(
        "Could not construct query for queries '%s', owner '%s'",
        parameters, owner));
  }
  return queryBuilder;
}
/**
 * Combine the queries of all search parameters into one boolean query in
 * which every parameter query is a required clause.
 *
 * @return The boolean query for the search parameters
 */
private QueryBuilder boolQueryFromParams() {
  final List<QueryBuilder> clauses = getQueries();
  BoolQueryBuilder conjunction = boolQuery();
  for (int i = 0; i < clauses.size(); i++) {
    conjunction = conjunction.must(clauses.get(i));
  }
  return conjunction;
}
/**
 * Build one query per search parameter, using the index query registered for
 * the parameter and the parameter's value.
 *
 * @return The queries for all search parameters
 */
private List<QueryBuilder> getQueries() {
  final List<QueryBuilder> built = new ArrayList<>();
  parameters.forEach((parameter, value) -> built
      .add(index.queries().get(parameter).build(value)));
  return built;
}
/**
 * Check the settings of this search before it is executed: the index must be
 * set, every parameter must be supported by the index, 'from' must not be
 * negative, 'size' must not exceed 100 and 'sort', if given, must be a
 * supported sort order.
 *
 * @throws IllegalArgumentException If any of these checks fails
 */
private void validateSearchParameters() {
  if (index == null) {
    // Wrap the values in a list: formatting the array itself with %s would
    // print its identity instead of the valid indexes.
    throw new IllegalArgumentException(
        String.format("Invalid index ('%s') - valid indexes: %s", index,
            Arrays.asList(Index.values())));
  }
  for (Parameter parameter : parameters.keySet()) {
    if (!index.queries().containsKey(parameter)) {
      throw new IllegalArgumentException(String.format(
          "Invalid parameter ('%s') for specified index ('%s') - valid: %s",
          parameter, index, index.queries().keySet()));
    }
  }
  if (from < 0) {
    throw new IllegalArgumentException("Parameter 'from' must be positive");
  }
  if (size > 100) {
    throw new IllegalArgumentException("Parameter 'size' must be <= 100");
  }
  final List<String> sortSupported = Arrays.asList("newest", "oldest");
  if (!sort.isEmpty() && !sortSupported.contains(sort)) {
    throw new IllegalArgumentException("Parameter 'sort' must be one of: "
        + Joiner.on(", ").join(sortSupported));
  }
}
/**
 * Execute the given query against this search's index, using the configured
 * paging and, if set, the configured sort order.
 *
 * @param queryBuilder The query to execute
 * @param searchType The search type to use; for a scan, a scroll of one
 *          minute is requested
 * @return The response of the search request
 */
private SearchResponse search(final QueryBuilder queryBuilder,
    SearchType searchType) {
  SearchRequestBuilder requestBuilder = client.prepareSearch(index.id());
  requestBuilder = requestBuilder.setSearchType(searchType);
  requestBuilder = requestBuilder.setQuery(queryBuilder);
  requestBuilder = requestBuilder
      .setPostFilter(FilterBuilders.typeFilter(index.type()));
  if (searchType.equals(SearchType.SCAN)) {
    requestBuilder.setScroll(TimeValue.timeValueMinutes(1));
  }
  if (owner.equals("*")) {
    // NOTE(review): this sets the post filter a second time, which
    // presumably replaces the type filter set above - confirm that is
    // intended.
    requestBuilder = requestBuilder.setPostFilter(FilterBuilders.existsFilter(//
        "@graph.http://purl.org/vocab/frbr/core#exemplar.@id"));
  }
  if (!sort.isEmpty()) {
    final SortOrder issuedOrder =
        sort.equals("newest") ? SortOrder.DESC : SortOrder.ASC;
    requestBuilder.addSort(SortBuilders
        .fieldSort("@graph.http://purl.org/dc/terms/issued.@value")
        .order(issuedOrder).ignoreUnmapped(true));
  }
  return requestBuilder.setFrom(from).setSize(size).setExplain(false)
      .execute().actionGet();
}
/**
 * Convert search hits to documents and keep only those for which a matched
 * field was found. Hits that cannot be processed because of an
 * IllegalArgumentException are logged and skipped.
 *
 * @param hits The hits to convert
 * @param searchFields The fields that were searched
 * @return An immutable list of the documents with a matched field
 */
private List<Document> asDocuments(final SearchHits hits,
    final List<String> searchFields) {
  final List<Document> res = new ArrayList<>();
  for (SearchHit hit : hits) {
    try {
      final Hit hitEnum = Hit.of(hit, searchFields);
      // Decode the source bytes explicitly as UTF-8 instead of relying on
      // the platform default charset, which would garble non-ASCII
      // characters on systems with a different default.
      final String source =
          new String(hit.source(), java.nio.charset.StandardCharsets.UTF_8);
      final Document document =
          new Document(hit.getId(), source, index, field);
      // The hit is processed with the value of the first search parameter.
      res.add(
          hitEnum.process(parameters.values().iterator().next(), document));
    } catch (IllegalArgumentException e) {
      Logger.error(e.getMessage(), e);
    }
  }
  final Predicate<Document> predicate = doc -> doc.matchedField != null;
  return ImmutableList.copyOf(Iterables.filter(res, predicate));
}
/** @return The chunked output that a scroll scan dump is written to */
private Chunks.Out<String> getMessageOut() {
  return this.messageOut;
}
/** @param messageOut The chunked output to write a scroll scan dump to */
private void setMessageOut(final Chunks.Out<String> messageOut) {
  // The parameter shadows the field, so the qualified name is required.
  this.messageOut = messageOut;
}
}