//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.consumers.template;
import java.io.IOException;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import com.fasterxml.jackson.annotation.JsonInclude.Include;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import io.searchbox.client.JestClient;
import io.searchbox.client.JestResult;
import io.searchbox.core.Index;
import io.searchbox.indices.CreateIndex;
import io.searchbox.indices.IndicesExists;
import io.searchbox.indices.mapping.PutMapping;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ExternalResource;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
import uk.gov.dstl.baleen.consumers.utils.ConsumerUtils;
import uk.gov.dstl.baleen.resources.SharedElasticsearchRestResource;
/**
* A RecordConsumer that writes extracted records as documents to Elasticsearch.
* <p>
* Unlike the MongoDB consumer that writes all of the records for a source
* document as a single database document (with a records array), each record is
* written as a separate document.
* </p>
* <p>
* This stores the extracted records in an Elasticsearch index, specified using
* the index parameter, as the Elasticsearch type specified in the type
* parameter, into a shared Elasticsearch resource as supplied through the
* elasticsearchRest parameter. Each record's document ID is the external ID of
* its source document followed by the record name. The external ID is, by
* default, a hash of the document content but can optionally be derived from
* the document source URI by setting the contentHashAsId parameter to false.
* </p>
*/
public class ElasticsearchTemplateRecordConsumer extends AbstractTemplateRecordConsumer {

  /**
   * Connection to Elasticsearch
   *
   * @baleen.resource uk.gov.dstl.baleen.resources.SharedElasticsearchRestResource
   */
  public static final String KEY_ELASTICSEARCH_REST = "elasticsearchRest";

  @ExternalResource(key = KEY_ELASTICSEARCH_REST)
  private SharedElasticsearchRestResource esrResource;

  /**
   * The Elasticsearch index to use
   *
   * @baleen.config baleen_record_index
   */
  public static final String PARAM_INDEX = "index";

  @ConfigurationParameter(name = PARAM_INDEX, defaultValue = "baleen_record_index")
  protected String index;

  /**
   * The Elasticsearch type to use for documents inserted into the index
   *
   * @baleen.config baleen_record
   */
  public static final String PARAM_TYPE = "type";

  @ConfigurationParameter(name = PARAM_TYPE, defaultValue = "baleen_record")
  protected String type;

  /**
   * Should a hash of the content be used to generate the ID? If false, then a
   * hash of the Source URI is used instead.
   *
   * @baleen.config true
   */
  public static final String PARAM_CONTENT_HASH_AS_ID = "contentHashAsId";

  @ConfigurationParameter(name = PARAM_CONTENT_HASH_AS_ID, defaultValue = "true")
  private boolean contentHashAsId;

  /** The object mapper used to serialise records to JSON (null properties are omitted). */
  private ObjectMapper mapper;

  private static final String ES_PROPERTIES = "properties";
  private static final String ES_TYPE = "type";
  private static final String ES_TYPE_STRING = "string";
  private static final String ES_TYPE_NESTED = "nested";

  /**
   * Writes each extracted record to Elasticsearch as its own document.
   * <p>
   * Every record is wrapped in an {@code ElasticsearchExtractedRecord} together
   * with the document's external ID and the key of the map entry it came from,
   * serialised to JSON and indexed under the ID
   * {@code <externalId>-<recordName>}. Failures are logged as warnings and do
   * not stop the remaining records from being written.
   * </p>
   *
   * @param jCas
   *            the JCas the records were extracted from
   * @param documentSourceName
   *            the document source name (not used by this consumer)
   * @param extractedRecords
   *            the records to write; each map key is passed on as the
   *            record's source URI
   * @throws AnalysisEngineProcessException
   *             declared by the overridden method; not thrown directly here
   */
  @Override
  protected void writeRecords(
      JCas jCas,
      String documentSourceName,
      Map<String, Collection<ExtractedRecord>> extractedRecords)
      throws AnalysisEngineProcessException {
    String externalId = getUniqueId(jCas);

    List<ElasticsearchExtractedRecord> elasticSearchRecords =
        extractedRecords.entrySet().stream()
            .flatMap(
                entry -> {
                  String sourceUri = entry.getKey();
                  return entry.getValue().stream()
                      .map(r -> new ElasticsearchExtractedRecord(externalId, sourceUri, r));
                })
            .collect(Collectors.toList());

    for (ElasticsearchExtractedRecord elasticsearchExtractedRecord : elasticSearchRecords) {
      String json;
      try {
        json = mapper.writeValueAsString(elasticsearchExtractedRecord);
      } catch (JsonProcessingException e) {
        getMonitor().warn("Failed to serialise record for Elasticsearch - skipping", e);
        continue;
      }

      // NOTE(review): the ID is built from the external ID and record name only, so
      // this assumes record names are unique within a document - otherwise a later
      // record would overwrite an earlier one. Confirm against the record producer.
      Index doc =
          new Index.Builder(json)
              .id(String.format("%s-%s", externalId, elasticsearchExtractedRecord.getName()))
              .index(index)
              .type(type)
              .build();

      try {
        // Jest reports server-side rejections through the result rather than by
        // throwing, so the outcome has to be checked explicitly.
        JestResult result = esrResource.getClient().execute(doc);
        if (!result.isSucceeded()) {
          getMonitor()
              .warn(
                  "Failed to index document in Elasticsearch for index "
                      + index
                      + ": "
                      + result.getErrorMessage());
        }
      } catch (IOException e) {
        getMonitor().warn("Failed to index document in Elasticsearch for index " + index, e);
      }
    }
  }

  /**
   * Sets up the JSON mapper and makes sure the target index exists, adding the
   * record mapping when the index has just been created.
   *
   * @param aContext
   *            the UIMA context
   * @throws ResourceInitializationException
   *             if the superclass initialisation fails
   */
  @Override
  public void doInitialize(UimaContext aContext) throws ResourceInitializationException {
    super.doInitialize(aContext);

    mapper = new ObjectMapper();
    mapper.setSerializationInclusion(Include.NON_NULL);

    boolean indexCreated = createIndex();
    if (indexCreated) {
      try {
        XContentBuilder createMappingObject = createMappingObject();
        addMapping(createMappingObject);
      } catch (IOException ioe) {
        getMonitor()
            .error(
                "Unable to create mapping, you may get unexpected results in your Elasticsearch index",
                ioe);
      }
    }
  }

  /**
   * Create an index in Elasticsearch. If necessary, this function should
   * check whether a new index is required.
   * <p>
   * The index is only created when the existence check does not answer with
   * HTTP 200. A creation request that Elasticsearch rejects is logged and
   * reported as {@code false}, so that no mapping is attempted against an
   * index that was not created by this call.
   * </p>
   *
   * @return true if a new index has been created, false otherwise
   */
  public boolean createIndex() {
    JestClient client = esrResource.getClient();
    try {
      JestResult exists = client.execute(new IndicesExists.Builder(index).build());
      if (exists.getResponseCode() == 200) {
        return false;
      }

      JestResult created = client.execute(new CreateIndex.Builder(index).build());
      if (created.isSucceeded()) {
        return true;
      }
      getMonitor().error("Unable to create index " + index + ": " + created.getErrorMessage());
    } catch (IOException ioe) {
      getMonitor().error("Unable to create index", ioe);
    }
    return false;
  }

  /**
   * Add a mapping to Elasticsearch. This will only be called if a new index
   * has been created. Failures, including a mapping rejected by Elasticsearch,
   * are logged rather than thrown.
   *
   * @param mapping
   *            the mapping definition to register for the configured index and type
   */
  public void addMapping(XContentBuilder mapping) {
    try {
      PutMapping putMapping = new PutMapping.Builder(index, type, mapping.string()).build();
      JestResult result = esrResource.getClient().execute(putMapping);
      if (!result.isSucceeded()) {
        getMonitor()
            .error("Unable to add mapping to index " + index + ": " + result.getErrorMessage());
      }
    } catch (IOException ioe) {
      getMonitor().error("Unable to add mapping to index", ioe);
    }
  }

  /**
   * Create a mapping for the new index.
   * <p>
   * The mapping declares {@code externalId}, {@code sourceUri}, {@code kind}
   * and {@code name} as string properties and {@code fields} as a nested
   * object with string {@code name} and {@code value} properties.
   * </p>
   *
   * @return the builder holding the mapping definition
   * @throws IOException
   *             if the JSON content cannot be built
   */
  private XContentBuilder createMappingObject() throws IOException {
    return XContentFactory.jsonBuilder()
        .startObject()
        .startObject(type)
        .startObject(ES_PROPERTIES)
        .startObject("externalId")
        .field(ES_TYPE, ES_TYPE_STRING)
        .endObject()
        .startObject("sourceUri")
        .field(ES_TYPE, ES_TYPE_STRING)
        .endObject()
        .startObject("kind")
        .field(ES_TYPE, ES_TYPE_STRING)
        .endObject()
        .startObject("name")
        .field(ES_TYPE, ES_TYPE_STRING)
        .endObject()
        .startObject("fields")
        .field(ES_TYPE, ES_TYPE_NESTED)
        .startObject(ES_PROPERTIES)
        .startObject("name")
        .field(ES_TYPE, ES_TYPE_STRING)
        .endObject()
        .startObject("value")
        .field(ES_TYPE, ES_TYPE_STRING)
        .endObject()
        .endObject()
        .endObject()
        .endObject()
        .endObject()
        .endObject();
  }

  /**
   * Gets the unique id for a document (if contentHashAsId is true then a
   * hash of the content is used, otherwise a hash of the source URI is used).
   *
   * @param jCas
   *            the JCas
   * @return the unique id
   */
  private String getUniqueId(JCas jCas) {
    return ConsumerUtils.getExternalId(getDocumentAnnotation(jCas), contentHashAsId);
  }
}