//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.consumers;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ExternalResource;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.DocumentAnnotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.bson.BsonSerializationException;
import org.bson.Document;
import com.google.common.collect.Multimap;
import com.google.common.collect.MultimapBuilder;
import com.mongodb.MongoException;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import uk.gov.dstl.baleen.consumers.utils.ConsumerUtils;
import uk.gov.dstl.baleen.consumers.utils.DefaultFields;
import uk.gov.dstl.baleen.consumers.utils.EntityRelationConverter;
import uk.gov.dstl.baleen.consumers.utils.IEntityConverterFields;
import uk.gov.dstl.baleen.resources.SharedMongoResource;
import uk.gov.dstl.baleen.types.metadata.Metadata;
import uk.gov.dstl.baleen.types.metadata.PublishedId;
import uk.gov.dstl.baleen.types.semantic.Entity;
import uk.gov.dstl.baleen.types.semantic.ReferenceTarget;
import uk.gov.dstl.baleen.types.semantic.Relation;
import uk.gov.dstl.baleen.uima.BaleenConsumer;
import uk.gov.dstl.baleen.uima.utils.UimaTypesUtils;
/**
* Output processed CAS object into MongoDB.
*
* <p>This consumer will output to Mongo using a schema which consists of 3 collections with the formats described below.
* For each CAS processed, any existing reference to a document with the same external ID is deleted.</p>
* <p><b>documents</b></p>
* <pre>
{
document: {
type,
source,
language,
timestamp,
classification,
caveats: [],
releasability: []
},
publishedIds: [],
metadata: {
key: [value, ...],
...
},
content,
externalId
}
* </pre>
*
* <p><b>entities</b></p>
* <p>Entities are grouped by their reference target, so all the entities in one Mongo document refer to the same thing.
* Additional fields may be present depending on the entity type.</p>
* <pre>
{
docId,
entities: [
{
confidence,
externalId,
begin,
end,
type,
value,
...
}
]
}
* </pre>
*
* <p><b>relations</b></p>
* <p>Relations link two entities that are stored in the <em>entities</em> collection, which are referred to by their externalId.
* Additional fields may be present depending on the relation type.</p>
* <pre>
{
docId,
source,
target,
begin,
end,
type,
relationshipType,
relationSubtype,
value,
confidence,
...
}
* </pre>
*
*
* @baleen.javadoc
*/
public class Mongo extends BaleenConsumer {
/**
* Connection to Mongo
*
* @baleen.resource uk.gov.dstl.baleen.resources.SharedMongoResource
*/
public static final String KEY_MONGO = "mongo";
@ExternalResource(key = KEY_MONGO)
private SharedMongoResource mongoResource;
/**
* Should a hash of the content be used to generate the ID?
* If false, then a hash of the Source URI is used instead.
*
* @baleen.config true
*/
public static final String PARAM_CONTENT_HASH_AS_ID = "contentHashAsId";
@ConfigurationParameter(name = PARAM_CONTENT_HASH_AS_ID, defaultValue = "true")
private boolean contentHashAsId = true;
/**
* Should we output the history to Mongo?
*
* @baleen.config false
*/
public static final String PARAM_OUTPUT_HISTORY = "outputHistory";
@ConfigurationParameter(name = PARAM_OUTPUT_HISTORY, defaultValue = "false")
private boolean outputHistory = false;
/**
* The collection to output entities to
*
* @baleen.config entities
*/
public static final String PARAM_ENTITIES_COLLECTION = "entities";
@ConfigurationParameter(name = PARAM_ENTITIES_COLLECTION, defaultValue = "entities")
private String entitiesCollectionName;
/**
* The collection to output relationships to
*
* @baleen.config relations
*/
public static final String PARAM_RELATIONS_COLLECTION = "relations";
@ConfigurationParameter(name = PARAM_RELATIONS_COLLECTION, defaultValue = "relations")
private String relationsCollectionName;
/**
* The collection to output documents to
*
* @baleen.config documents
*/
public static final String PARAM_DOCUMENTS_COLLECTION = "documents";
@ConfigurationParameter(name = PARAM_DOCUMENTS_COLLECTION, defaultValue = "documents")
private String documentsCollectionName;
/**
* Should we output the document content to Mongo?
*
* @baleen.config true
*/
public static final String PARAM_OUTPUT_CONTENT = "outputContent";
@ConfigurationParameter(name = PARAM_OUTPUT_CONTENT, defaultValue = "true")
private boolean outputContent = false;
private MongoCollection<Document> entitiesCollection;
private MongoCollection<Document> relationsCollection;
private MongoCollection<Document> documentsCollection;
/**
* Holds the types of features that we're not interested in persisting (stuff from UIMA for example)
* We're storing these so that we can loop through the features (and then ignore some of them)
*/
private Set<String> stopFeatures;
//Fields
public static final String FIELD_DOCUMENT_ID = "docId";
public static final String FIELD_ENTITIES = "entities";
public static final String FIELD_DOCUMENT = "document";
public static final String FIELD_DOCUMENT_TYPE = "type";
public static final String FIELD_DOCUMENT_SOURCE = "source";
public static final String FIELD_DOCUMENT_LANGUAGE = "language";
public static final String FIELD_DOCUMENT_TIMESTAMP = "timestamp";
public static final String FIELD_DOCUMENT_CLASSIFICATION = "classification";
public static final String FIELD_DOCUMENT_CAVEATS = "caveats";
public static final String FIELD_DOCUMENT_RELEASABILITY = "releasability";
public static final String FIELD_PUBLISHEDIDS = "publishedIds";
public static final String FIELD_PUBLISHEDIDS_ID = "id";
public static final String FIELD_PUBLISHEDIDS_TYPE = "type";
public static final String FIELD_METADATA = "metadata";
public static final String FIELD_CONTENT = "content";
private final IEntityConverterFields fields = new DefaultFields();
/**
* Get the mongo db, collection and create some indexes
*/
@Override
public void doInitialize(UimaContext aContext) throws ResourceInitializationException {
MongoDatabase db = mongoResource.getDB();
entitiesCollection = db.getCollection(entitiesCollectionName);
relationsCollection = db.getCollection(relationsCollectionName);
documentsCollection = db.getCollection(documentsCollectionName);
documentsCollection.createIndex(new Document(fields.getExternalId(), 1));
entitiesCollection.createIndex(new Document(fields.getExternalId(), 1));
relationsCollection.createIndex(new Document(fields.getExternalId(), 1));
relationsCollection.createIndex(new Document(FIELD_DOCUMENT_ID, 1));
entitiesCollection.createIndex(new Document(FIELD_DOCUMENT_ID, 1));
stopFeatures = new HashSet<>();
stopFeatures.add("uima.cas.AnnotationBase:sofa");
stopFeatures.add("uk.gov.dstl.baleen.types.BaleenAnnotation:internalId");
}
@Override
public void doDestroy() {
entitiesCollection = null;
relationsCollection = null;
documentsCollection = null;
}
protected String getUniqueId(JCas jCas) {
return ConsumerUtils.getExternalId(getDocumentAnnotation(jCas), contentHashAsId);
}
@Override
protected void doProcess(JCas jCas) throws AnalysisEngineProcessException {
String documentId = getUniqueId(jCas);
// Delete any existing content in the database
deleteAllContent(documentId);
// Save
try{
saveDocument(documentId, jCas);
}catch(MongoException | BsonSerializationException e){
getMonitor().error("Unable to persist document to database - document {} will be skipped", getDocumentAnnotation(jCas).getSourceUri(), e);
return;
}
try{
saveEntities(documentId, jCas);
}catch(MongoException | BsonSerializationException e){
getMonitor().error("Unable to persist entities to database - document {} will contain no entities", getDocumentAnnotation(jCas).getSourceUri(), e);
}
try{
saveRelations(documentId, jCas);
}catch(MongoException | BsonSerializationException e){
getMonitor().error("Unable to persist relations to database - document {} will contain no relations", getDocumentAnnotation(jCas).getSourceUri(), e);
}
}
private void deleteAllContent(String documentId) {
entitiesCollection.deleteMany(new Document(FIELD_DOCUMENT_ID, documentId));
relationsCollection.deleteMany(new Document(FIELD_DOCUMENT_ID, documentId));
documentsCollection.deleteMany(new Document(fields.getExternalId(), documentId));
}
private void saveDocument(String documentId, JCas jCas) {
Document doc = new Document();
// document level
DocumentAnnotation da = getDocumentAnnotation(jCas);
doc.append(FIELD_DOCUMENT, new Document()
.append(FIELD_DOCUMENT_TYPE, da.getDocType())
.append(FIELD_DOCUMENT_SOURCE, da.getSourceUri())
.append(FIELD_DOCUMENT_LANGUAGE, da.getLanguage())
.append(FIELD_DOCUMENT_TIMESTAMP, new Date(da.getTimestamp()))
.append(FIELD_DOCUMENT_CLASSIFICATION, da.getDocumentClassification())
.append(FIELD_DOCUMENT_CAVEATS, UimaTypesUtils.toList(da.getDocumentCaveats()))
.append(FIELD_DOCUMENT_RELEASABILITY, UimaTypesUtils.toList(da.getDocumentReleasability()))
);
// Published Ids
List<Document> publishedIds = new ArrayList<>();
for(PublishedId pid : JCasUtil.select(jCas, PublishedId.class)) {
publishedIds.add(new Document(FIELD_PUBLISHEDIDS_TYPE, pid.getPublishedIdType())
.append(FIELD_PUBLISHEDIDS_ID, pid.getValue()));
}
doc.append(FIELD_PUBLISHEDIDS, publishedIds);
// Meta data
Multimap<String,Object> meta = MultimapBuilder.linkedHashKeys().linkedListValues().build();
for(Metadata metadata : JCasUtil.select(jCas, Metadata.class)) {
String key = metadata.getKey();
if(key.contains(".")){ //Field names can't contain a "." in Mongo, so replace with a _
key = key.replaceAll("\\.", "_");
}
meta.put(key, metadata.getValue());
}
doc.append(FIELD_METADATA, meta.asMap());
// Add content is requried
if(outputContent) {
doc.append(FIELD_CONTENT, jCas.getDocumentText());
}
// Save
doc.append(fields.getExternalId(), documentId);
documentsCollection.insertOne(doc);
}
private void saveEntities(String documentId, JCas jCas) {
EntityRelationConverter converter = new EntityRelationConverter(getMonitor(), outputHistory, getSupport().getDocumentHistory(jCas), stopFeatures, fields);
// Compile all the reference targets together
Multimap<ReferenceTarget, Entity> targetted = MultimapBuilder.hashKeys().linkedListValues().build();
for(Entity entity : JCasUtil.select(jCas, Entity.class)) {
if( entity.getReferent() != null ) {
targetted.put(entity.getReferent(), entity);
} else {
// Create a fake reference target
targetted.put(new ReferenceTarget(jCas), entity);
}
}
List<Document> ents = targetted.asMap().entrySet().stream().map(e -> {
Document doc = new Document();
doc.append(FIELD_DOCUMENT_ID, documentId);
doc.append(FIELD_ENTITIES,
e.getValue().stream().map(ent ->
converter.convertEntity(ent)).collect(Collectors.toList())
);
return doc;
}).collect(Collectors.toList());
if(!ents.isEmpty())
entitiesCollection.insertMany(ents);
}
private void saveRelations(String documentId, JCas jCas) {
EntityRelationConverter converter = new EntityRelationConverter(getMonitor(), outputHistory, getSupport().getDocumentHistory(jCas), stopFeatures, fields);
List<Document> rels = JCasUtil.select(jCas, Relation.class).stream()
.map(r -> new Document(converter.convertRelation(r)).append(FIELD_DOCUMENT_ID, documentId))
.collect(Collectors.toList());
if(!rels.isEmpty())
relationsCollection.insertMany(rels);
}
}