//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.interactions;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.stream.Stream;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ExternalResource;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.bson.Document;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Multimap;
import com.mongodb.client.MongoCollection;
import opennlp.tools.stemmer.snowball.SnowballStemmer;
import opennlp.tools.stemmer.snowball.SnowballStemmer.ALGORITHM;
import uk.gov.dstl.baleen.annotators.patterns.data.InteractionTypeDefinition;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.resources.SharedMongoResource;
import uk.gov.dstl.baleen.types.language.Interaction;
import uk.gov.dstl.baleen.types.language.WordToken;
import uk.gov.dstl.baleen.uima.BaleenAnnotator;
import uk.gov.dstl.baleen.uima.utils.ComparableEntitySpanUtils;
/**
* Assign relation type and subtype to interaction.
*
* The mongo gazetteers only allow a word to map to to one annotation type (eg "City of London" maps
* to only one London in the world). For interactions that is not sufficient, we need attack (noun)
* and attack (verb) to map to potentially different interactions.
*
* So for interactions we use the gazetteers to assign the interaction annotators. This annotator
* then reviews that assignment and:
* <ul>
* <li>Removes annotations where the part of speech don't match
* <li>Adds information to interaction annotation such as the type and subtype
* <li>Duplicates annotations which have multiple senses
* </ul>
*
* This annotator uses to relationTypes collection generated by {@link UploadInteractionsToMongo}.
* Note that using this annotator may be optional is you have a very simple set of interaction words
* (without overlaps, which the gazetteers may cope with).
*
* @baleen.javadoc
*
*/
public class AssignTypeToInteraction extends BaleenAnnotator {
/**
* Connection to Mongo
*
* @baleen.resource uk.gov.dstl.baleen.resources.SharedMongoResource
*/
public static final String KEY_MONGO = "mongo";
@ExternalResource(key = KEY_MONGO)
private SharedMongoResource mongo;
/**
* The name of the Mongo collection containing the relation types
*
* @baleen.config gazetteer
*/
public static final String PARAM_COLLECTION = "collection";
@ConfigurationParameter(name = PARAM_COLLECTION, defaultValue = "relationTypes")
private String collection;
/**
* The name of the field in Mongo that contains the relation type
*
* @baleen.config type
*/
public static final String PARAM_TYPE_FIELD = "typeField";
@ConfigurationParameter(name = PARAM_TYPE_FIELD, defaultValue = "type")
private String typeField;
/**
* The name of the field in Mongo that contains the relation sub type
*
* @baleen.config type
*/
public static final String PARAM_SUBTYPE_FIELD = "subTypeField";
@ConfigurationParameter(name = PARAM_SUBTYPE_FIELD, defaultValue = "subType")
private String subTypeField;
/**
* The name of the field in Mongo that contains the relation source type
*
* @baleen.config source
*/
public static final String PARAM_SOURCE_FIELD = "sourceField";
@ConfigurationParameter(name = PARAM_SOURCE_FIELD, defaultValue = "source")
private String sourceField;
/**
* The name of the field in Mongo that contains the relation source type
*
* @baleen.config target
*/
public static final String PARAM_TARGET_FIELD = "targetField";
@ConfigurationParameter(name = PARAM_TARGET_FIELD, defaultValue = "target")
private String targetField;
/**
* The name of the field in Mongo that contains the relation pos
*
* @baleen.config posField pos
*/
public static final String PARAM_POS_FIELD = "posField";
@ConfigurationParameter(name = PARAM_POS_FIELD, defaultValue = "pos")
private String posField;
/**
* The name of the field in Mongo that contains the relation values
*
* @baleen.config posField pos
*/
public static final String PARAM_VALUES_FIELD = "valueField";
@ConfigurationParameter(name = PARAM_VALUES_FIELD, defaultValue = "value")
private String valuesField;
/**
* The stemming algorithm to use, as defined in OpenNLP's SnowballStemmer.ALGORITHM enum
*
* @baleen.config ENGLISH
*/
public static final String PARAM_ALGORITHM = "algorithm";
@ConfigurationParameter(name = PARAM_ALGORITHM, defaultValue = "ENGLISH")
protected String algorithm;
/**
* Should the words be stemmed before processing?
*
* Set false if you want a very precise match against your values, effectively they must be the
* interaction values. Set to true for a more relaxed match but which might produce false
* positives.
*
* @baleen.config true
*/
public static final String PARAM_STEM = "stem";
@ConfigurationParameter(name = PARAM_STEM, defaultValue = "true")
protected boolean stem;
private final Multimap<String, InteractionTypeDefinition> definitions = HashMultimap.create();
private SnowballStemmer stemmer;
@Override
public void doInitialize(final UimaContext aContext) throws ResourceInitializationException {
super.doInitialize(aContext);
ALGORITHM algo;
try{
algo = ALGORITHM.valueOf(algorithm);
}catch(IllegalArgumentException iae){
getMonitor().warn("Algorithm {} doesn't exist, defaulting to ENGLISH", algorithm, iae);
algo = ALGORITHM.ENGLISH;
}
stemmer = new SnowballStemmer(algo);
final MongoCollection<Document> dbCollection = mongo.getDB().getCollection(collection);
for(Document o : dbCollection.find()){
String type = (String) o.get(typeField);
String subType = (String) o.get(subTypeField);
String pos = (String) o.get(posField);
List<?> values = (List<?>) o.get(valuesField);
InteractionTypeDefinition definition = new InteractionTypeDefinition(type, subType, pos);
values.stream()
.filter(s -> s instanceof String)
.forEach(s -> {
String key = toKey(definition.getPos(), (String) s);
definitions.put(key, definition);
});
}
}
private String toKey(String pos, String word) {
CharSequence normalised = word.toLowerCase().trim();
if (stem) {
normalised = stemmer.stem(normalised);
}
return String.format("%s:%s", Character.toLowerCase(pos.charAt(0)), normalised);
}
@Override
protected void doProcess(JCas jCas) throws AnalysisEngineProcessException {
Map<Interaction, Collection<WordToken>> interactionToWords = JCasUtil.indexCovered(jCas, Interaction.class,
WordToken.class);
Collection<Interaction> allInteractions = new ArrayList<>(JCasUtil.select(jCas, Interaction.class));
for (Interaction interaction : allInteractions) {
String value = interaction.getCoveredText();
Collection<WordToken> words = interactionToWords.get(interaction);
if (words != null && !words.isEmpty() && value != null && !value.isEmpty()) {
// So we have the covered words and the interaction value (ie the word covered by
// the interact)
// Look for a string match between the interaction value and the words then find all
// the potential POS it could be
Stream<String> keys = words.stream()
.filter(p -> p.getCoveredText().equalsIgnoreCase(value))
.map(w -> w.getPartOfSpeech())
.distinct()
.filter(Objects::nonNull)
.map(p -> toKey(p, value));
// For each interaction we create a new interaction which is has the right type info
// This get does POS matching for us
keys.map(definitions::get)
.filter(l -> l != null && !l.isEmpty())
.flatMap(Collection::stream)
.forEach(d -> {
Interaction i = ComparableEntitySpanUtils.copyInteraction(jCas, interaction.getBegin(),
interaction.getEnd(), interaction);
i.setRelationshipType(d.getType());
i.setRelationSubType(d.getSubType());
addToJCasIndex(i);
});
}
}
// Delete the old interaction, its either been replaced or not
removeFromJCasIndex(allInteractions);
}
@Override
public AnalysisEngineAction getAction() {
return new AnalysisEngineAction(ImmutableSet.of(WordToken.class, Interaction.class), Collections.emptySet());
}
}