//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.consumers.utils;
import java.io.IOException;
import java.util.Map;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
import uk.gov.dstl.baleen.uima.BaleenConsumer;
/**
 * Abstract base class that builds the objects needed by the Elasticsearch consumers.
 *
 * Documents are formatted using the schema defined in {@link SingleDocumentConsumerFormat}.
 * When a new index is created, an explicit mapping is registered for it so that
 * Elasticsearch does not have to guess (and possibly get wrong) the type of a field.
 *
 * Be aware that this schema is not compatible with that of Baleen 1, which is no longer supported.
 *
 * @baleen.javadoc
 */
public abstract class AbstractElasticsearchConsumer extends BaleenConsumer {

    /**
     * The Elasticsearch index to use
     *
     * @baleen.config baleen_index
     */
    public static final String PARAM_INDEX = "index";
    @ConfigurationParameter(name = PARAM_INDEX, defaultValue = "baleen_index")
    protected String index;

    /**
     * The Elasticsearch type to use for documents inserted into the index
     *
     * @baleen.config baleen_output
     */
    public static final String PARAM_TYPE = "type";
    @ConfigurationParameter(name = PARAM_TYPE, defaultValue = "baleen_output")
    protected String type;

    /**
     * Should a hash of the content be used to generate the ID?
     * If false, then a hash of the Source URI is used instead.
     *
     * @baleen.config true
     */
    public static final String PARAM_CONTENT_HASH_AS_ID = "contentHashAsId";
    @ConfigurationParameter(name = PARAM_CONTENT_HASH_AS_ID, defaultValue = "true")
    boolean contentHashAsId = true;

    // Keys and type names used when building the Elasticsearch mapping
    private static final String ES_PROPERTIES = "properties";
    private static final String ES_TYPE = "type";
    private static final String ES_TYPE_STRING = "string";
    private static final String ES_TYPE_INTEGER = "integer";
    private static final String ES_TYPE_LONG = "long";
    private static final String ES_TYPE_DOUBLE = "double";
    private static final String ES_TYPE_GEOSHAPE = "geo_shape";
    private static final String ES_TYPE_DATE = "date";
    private static final String ES_TYPE_NESTED = "nested";

    // Field naming scheme handed to SingleDocumentConsumerFormat when formatting a document
    private IEntityConverterFields fields = new DefaultFields();

    /**
     * Ensures the index exists and, if it was newly created, registers the default mapping for it.
     *
     * A failure to build the mapping is logged through the monitor rather than propagated,
     * so initialisation carries on without the explicit mapping.
     *
     * @param aContext the UIMA context (not used by this implementation)
     * @throws ResourceInitializationException declared by the overridden method
     */
    @Override
    public void doInitialize(UimaContext aContext) throws ResourceInitializationException {
        if (!createIndex()) {
            // Index was not newly created, so no mapping is added
            return;
        }

        try {
            addMapping(createMappingObject());
        } catch (IOException ioe) {
            getMonitor().error("Unable to create mapping, you may get unexpected results in your Elasticsearch index", ioe);
        }
    }

    /**
     * Create an index in Elasticsearch.
     * If necessary, implementations should check whether a new index is required.
     *
     * @return true if a new index has been created, false otherwise
     */
    public abstract boolean createIndex();

    /**
     * Add a mapping to Elasticsearch.
     * This is only called from {@link #doInitialize(UimaContext)} when {@link #createIndex()}
     * reported that a new index has been created.
     *
     * @param mapping the mapping definition to register
     */
    public abstract void addMapping(XContentBuilder mapping);

    /**
     * Build the mapping for a newly created index.
     *
     * Only the known non-String types and the potential problem cases are declared explicitly.
     *
     * @return a builder holding the JSON mapping, keyed by the configured document type
     * @throws IOException if the JSON builder fails while writing the mapping
     */
    private XContentBuilder createMappingObject() throws IOException {
        XContentBuilder mapping = XContentFactory.jsonBuilder();

        mapping.startObject();
        mapping.startObject(type);
        mapping.startObject(ES_PROPERTIES);

        typedProperty(mapping, "dateAccessed", ES_TYPE_LONG);

        // Metadata: nested key/value pairs
        mapping.startObject("metadata");
        mapping.field(ES_TYPE, ES_TYPE_NESTED);
        mapping.startObject(ES_PROPERTIES);
        typedProperty(mapping, "value", ES_TYPE_STRING);
        typedProperty(mapping, "key", ES_TYPE_STRING);
        mapping.endObject();
        mapping.endObject();

        // Entities: nested objects with offsets, confidence, geometry and time stamps
        mapping.startObject("entities");
        mapping.field(ES_TYPE, ES_TYPE_NESTED);
        mapping.startObject(ES_PROPERTIES);
        typedProperty(mapping, "value", ES_TYPE_STRING);
        typedProperty(mapping, "begin", ES_TYPE_INTEGER);
        typedProperty(mapping, "end", ES_TYPE_INTEGER);
        typedProperty(mapping, "confidence", ES_TYPE_DOUBLE);
        typedProperty(mapping, "geoJson", ES_TYPE_GEOSHAPE);
        epochSecondDateProperty(mapping, "timestampStart");
        epochSecondDateProperty(mapping, "timestampStop");
        mapping.endObject();
        mapping.endObject();

        mapping.endObject(); // properties
        mapping.endObject(); // document type
        return mapping.endObject(); // root object
    }

    /**
     * Write a property that declares nothing but its type.
     *
     * @param builder the mapping being built
     * @param name the property name
     * @param esType the Elasticsearch type to declare for the property
     * @throws IOException if the builder fails to write
     */
    private static void typedProperty(XContentBuilder builder, String name, String esType) throws IOException {
        builder.startObject(name);
        builder.field(ES_TYPE, esType);
        builder.endObject();
    }

    /**
     * Write a date property whose format is declared as epoch_second.
     *
     * @param builder the mapping being built
     * @param name the property name
     * @throws IOException if the builder fails to write
     */
    private static void epochSecondDateProperty(XContentBuilder builder, String name) throws IOException {
        builder.startObject(name);
        builder.field(ES_TYPE, ES_TYPE_DATE);
        builder.field("format", "epoch_second");
        builder.endObject();
    }

    /**
     * Format the document held in the CAS and hand it to {@link #addDocument(String, Map)}.
     *
     * The id passed on is the value stored under "externalId" in the formatted document,
     * or an empty string when that key is absent.
     *
     * @param jCas the CAS holding the document to persist
     * @throws AnalysisEngineProcessException declared by the overridden method
     */
    @Override
    protected void doProcess(JCas jCas) throws AnalysisEngineProcessException {
        Map<String, Object> document =
                SingleDocumentConsumerFormat.formatCas(jCas, fields, contentHashAsId, getMonitor(), getSupport());

        Object externalId = document.getOrDefault("externalId", "");

        // Persist to Elasticsearch
        addDocument((String) externalId, document);
    }

    /**
     * Add the document (provided as JSON) to Elasticsearch, using the id provided.
     *
     * @param id the id to store the document under
     * @param json the document content
     */
    public abstract void addDocument(String id, Map <String, Object> json);
}