//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.consumers.template; import java.util.Collection; import java.util.Map; import com.fasterxml.jackson.annotation.JsonInclude.Include; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.mongodb.client.MongoCollection; import com.mongodb.client.MongoDatabase; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.ExternalResource; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import org.bson.Document; import uk.gov.dstl.baleen.consumers.utils.ConsumerUtils; import uk.gov.dstl.baleen.resources.SharedMongoResource; /** * A RecordConsumer that writes RecordsDocuments to MongoDB. * * This stores the extracted records in a MongoDB collection, specified using * the records configuration parameter, using a shared Mongo resource as * supplied through the mongo configuration parameter. Document IDs are, by * default, a hash of the document content but can be optionally configured to * use the document source URI by setting the contentHashAsId parameter to * false. */ public class MongoTemplateRecordConsumer extends AbstractTemplateRecordConsumer { /** * Connection to Mongo * * @baleen.resource uk.gov.dstl.baleen.resources.SharedMongoResource */ public static final String KEY_MONGO = "mongo"; @ExternalResource(key = KEY_MONGO) private SharedMongoResource mongoResource; /** * Should a hash of the content be used to generate the ID? If false, then a * hash of the Source URI is used instead. * * @baleen.config true */ public static final String PARAM_CONTENT_HASH_AS_ID = "contentHashAsId"; @ConfigurationParameter(name = PARAM_CONTENT_HASH_AS_ID, defaultValue = "true") private boolean contentHashAsId = true; /** * The collection to output records to. * * @baleen.config records */ public static final String PARAM_RECORDS_COLLECTION = "records"; @ConfigurationParameter(name = PARAM_RECORDS_COLLECTION, defaultValue = "records") private String recordsCollectionName; private MongoCollection<Document> recordsCollection; private ObjectMapper objectMapper; @Override public void doInitialize(UimaContext aContext) throws ResourceInitializationException { super.doInitialize(aContext); MongoDatabase db = mongoResource.getDB(); recordsCollection = db.getCollection(recordsCollectionName); objectMapper = new ObjectMapper(); objectMapper.setSerializationInclusion(Include.NON_NULL); } @Override protected void doDestroy() { recordsCollection = null; } @Override protected void writeRecords(JCas jCas, String documentSourceName, Map<String, Collection<ExtractedRecord>> records) throws AnalysisEngineProcessException { MongoExtractedRecords mongoRecords = new MongoExtractedRecords(getUniqueId(jCas), documentSourceName, records); save(mongoRecords); } /** * Write records to MongoDB. * * @param mongoRecords * the mongo records */ private void save(MongoExtractedRecords mongoRecords) { Document document; try { document = createMongoDocument(mongoRecords); } catch (JsonProcessingException e) { getMonitor().warn("Failed to serialise records for Mongo", e); return; } recordsCollection.insertOne(document); } /** * Creates a mongo document pojo for serialisation. * * @param mongoRecords * the mongo records * @return the document * @throws JsonProcessingException * the json processing exception */ private Document createMongoDocument(MongoExtractedRecords mongoRecords) throws JsonProcessingException { String json = objectMapper.writeValueAsString(mongoRecords); return Document.parse(json); } /** * Gets the unique id for a document (if contentHashAsId is true then as * hash of the content is used, otherwise a hash of the source URI is used). * * @param jCas * the JCas * @return the unique id */ private String getUniqueId(JCas jCas) { return ConsumerUtils.getExternalId(getDocumentAnnotation(jCas), contentHashAsId); } }