//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.consumers;
import java.util.ArrayList;
import java.util.List;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ExternalResource;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.bson.Document;
import com.mongodb.client.MongoCollection;
import uk.gov.dstl.baleen.resources.SharedMongoResource;
import uk.gov.dstl.baleen.types.Base;
import uk.gov.dstl.baleen.types.language.Pattern;
import uk.gov.dstl.baleen.types.language.WordToken;
import uk.gov.dstl.baleen.types.semantic.Entity;
import uk.gov.dstl.baleen.uima.BaleenConsumer;
/**
 * Saves patterns in a JCas to a Mongo collection.
 *
 * <p>Use this after a pattern extraction annotator (eg PatternExtractor) in order to create a
 * training set in Mongo for offline analysis. In other words this consumer will save UIMA Pattern
 * types to Mongo.
 *
 * <p>Only patterns whose source and target are both {@link Entity} annotations are saved. Each
 * saved document has the form:
 *
 * <pre>
 * {
 *   source: { text, type },
 *   target: { text, type },
 *   words:  [ { text, pos, lemma (only when the word has a lemma) }, ... ]
 * }
 * </pre>
 *
 * <p>This will clear the existing collection on initialisation, unless clear = false is set as a
 * configuration parameter.
 *
 * <p>Note that this is a BaleenConsumer but, like all consumers, it can be used as an annotator.
 * So saving midway through a pipeline, clearing the result and then creating more patterns is
 * supported by Baleen.
 *
 * @baleen.javadoc
 */
public class MongoPatternSaver extends BaleenConsumer {

  /**
   * Connection to Mongo
   *
   * @baleen.resource uk.gov.dstl.baleen.resources.SharedMongoResource
   */
  public static final String KEY_MONGO = "mongo";

  @ExternalResource(key = KEY_MONGO)
  private SharedMongoResource mongo;

  /**
   * The name of the Mongo collection to hold the patterns
   *
   * @baleen.config patterns
   */
  public static final String KEY_COLLECTION = "collection";

  @ConfigurationParameter(name = KEY_COLLECTION, defaultValue = "patterns")
  private String collection;

  /**
   * Clears the output pattern collection before saving.
   *
   * @baleen.config true
   */
  public static final String KEY_CLEAR = "clear";

  @ConfigurationParameter(name = KEY_CLEAR, defaultValue = "true")
  private Boolean clear;

  /** Handle on the configured output collection, resolved in {@link #doInitialize}. */
  private MongoCollection<Document> dbCollection;

  /**
   * Resolves the output collection from the shared Mongo resource and, when the clear parameter is
   * set, removes every document currently stored in it.
   *
   * @param aContext the UIMA context, passed through to the superclass
   * @throws ResourceInitializationException if the superclass initialisation fails
   */
  @Override
  public void doInitialize(final UimaContext aContext) throws ResourceInitializationException {
    super.doInitialize(aContext);
    dbCollection = mongo.getDB().getCollection(collection);

    // An empty filter matches every document, so this empties the configured collection only;
    // nothing else in the database is touched.
    if (clear) {
      dbCollection.deleteMany(new Document());
    }
  }

  /**
   * Converts every entity-to-entity pattern in the JCas into a Mongo document and inserts them in
   * a single batch.
   *
   * @param jCas the JCas whose Pattern annotations are saved
   * @throws AnalysisEngineProcessException declared by the superclass contract
   */
  @Override
  protected void doProcess(final JCas jCas) throws AnalysisEngineProcessException {
    final List<Document> patterns = new ArrayList<>();

    for (final Pattern pattern : JCasUtil.select(jCas, Pattern.class)) {
      final Base source = pattern.getSource();
      final Base target = pattern.getTarget();

      // Patterns linking anything other than two entities are ignored.
      if (source instanceof Entity && target instanceof Entity) {
        final Document object =
            new Document()
                .append("source", saveEntity((Entity) source))
                .append("target", saveEntity((Entity) target))
                .append("words", saveWords(pattern));
        patterns.add(object);
      }
    }

    // The Mongo driver rejects an empty batch, so only insert when something was collected.
    if (!patterns.isEmpty()) {
      dbCollection.insertMany(patterns);
    }
  }

  /**
   * Converts the words of a pattern into Mongo documents.
   *
   * <p>Each word becomes a document holding its covered text ("text"), its part of speech ("pos")
   * and, when the word has at least one lemma, the form of the first lemma ("lemma").
   *
   * @param pattern the pattern whose words are converted
   * @return one document per word, in pattern order; empty if the pattern has no words set
   */
  private List<Document> saveWords(final Pattern pattern) {
    final List<Document> list = new ArrayList<>();

    // The words feature is unset (null) on a pattern that was never given any words; treat that
    // as "no words" instead of failing the whole document with a NullPointerException.
    if (pattern.getWords() == null) {
      return list;
    }

    final int wordCount = pattern.getWords().size();
    for (int i = 0; i < wordCount; i++) {
      final WordToken w = pattern.getWords(i);
      if (w == null) {
        // Unfilled slot in the words array; nothing to record for it.
        continue;
      }

      final Document o =
          new Document().append("text", w.getCoveredText()).append("pos", w.getPartOfSpeech());

      // Only the first lemma is recorded, and only when one is actually present.
      if (w.getLemmas() != null && w.getLemmas().size() >= 1 && w.getLemmas(0) != null) {
        o.put("lemma", w.getLemmas(0).getLemmaForm());
      }

      list.add(o);
    }

    return list;
  }

  /**
   * Converts an entity into a Mongo document.
   *
   * @param entity the entity to convert
   * @return a document holding the entity's covered text ("text") and its type name ("type")
   */
  private Document saveEntity(final Entity entity) {
    return new Document()
        .append("text", entity.getCoveredText())
        .append("type", entity.getTypeName());
  }
}