// AssignTypeToInteraction.java example
//
// Explorer
// baleen-master
//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.interactions;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.stream.Stream;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ExternalResource;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.bson.Document;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Multimap;
import com.mongodb.client.MongoCollection;

import opennlp.tools.stemmer.snowball.SnowballStemmer;
import opennlp.tools.stemmer.snowball.SnowballStemmer.ALGORITHM;
import uk.gov.dstl.baleen.annotators.patterns.data.InteractionTypeDefinition;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.resources.SharedMongoResource;
import uk.gov.dstl.baleen.types.language.Interaction;
import uk.gov.dstl.baleen.types.language.WordToken;
import uk.gov.dstl.baleen.uima.BaleenAnnotator;
import uk.gov.dstl.baleen.uima.utils.ComparableEntitySpanUtils;

/**
 * Assign relation type and subtype to interaction.
 *
 * The mongo gazetteers only allow a word to map to one annotation type (eg "City of London" maps
 * to only one London in the world). For interactions that is not sufficient, we need attack (noun)
 * and attack (verb) to map to potentially different interactions.
 *
 * So for interactions we use the gazetteers to assign the interaction annotations. This annotator
 * then reviews that assignment and:
 * <ul>
 * <li>Removes annotations where the part of speech doesn't match
 * <li>Adds information to interaction annotation such as the type and subtype
 * <li>Duplicates annotations which have multiple senses
 * </ul>
 *
 * This annotator uses the relationTypes collection generated by {@link UploadInteractionsToMongo}.
 * Note that using this annotator may be optional if you have a very simple set of interaction words
 * (without overlaps, which the gazetteers may cope with).
 *
 * @baleen.javadoc
 *
 */
public class AssignTypeToInteraction extends BaleenAnnotator {
	/**
	 * Connection to Mongo
	 *
	 * @baleen.resource uk.gov.dstl.baleen.resources.SharedMongoResource
	 */
	public static final String KEY_MONGO = "mongo";
	@ExternalResource(key = KEY_MONGO)
	private SharedMongoResource mongo;

	/**
	 * The name of the Mongo collection containing the relation types
	 *
	 * @baleen.config relationTypes
	 */
	public static final String PARAM_COLLECTION = "collection";
	@ConfigurationParameter(name = PARAM_COLLECTION, defaultValue = "relationTypes")
	private String collection;

	/**
	 * The name of the field in Mongo that contains the relation type
	 *
	 * @baleen.config type
	 */
	public static final String PARAM_TYPE_FIELD = "typeField";
	@ConfigurationParameter(name = PARAM_TYPE_FIELD, defaultValue = "type")
	private String typeField;

	/**
	 * The name of the field in Mongo that contains the relation sub type
	 *
	 * @baleen.config subType
	 */
	public static final String PARAM_SUBTYPE_FIELD = "subTypeField";
	@ConfigurationParameter(name = PARAM_SUBTYPE_FIELD, defaultValue = "subType")
	private String subTypeField;

	/**
	 * The name of the field in Mongo that contains the relation source type.
	 *
	 * Note: this value is configurable but is not read anywhere within this class.
	 *
	 * @baleen.config source
	 */
	public static final String PARAM_SOURCE_FIELD = "sourceField";
	@ConfigurationParameter(name = PARAM_SOURCE_FIELD, defaultValue = "source")
	private String sourceField;

	/**
	 * The name of the field in Mongo that contains the relation target type.
	 *
	 * Note: this value is configurable but is not read anywhere within this class.
	 *
	 * @baleen.config target
	 */
	public static final String PARAM_TARGET_FIELD = "targetField";
	@ConfigurationParameter(name = PARAM_TARGET_FIELD, defaultValue = "target")
	private String targetField;

	/**
	 * The name of the field in Mongo that contains the relation pos
	 *
	 * @baleen.config pos
	 */
	public static final String PARAM_POS_FIELD = "posField";
	@ConfigurationParameter(name = PARAM_POS_FIELD, defaultValue = "pos")
	private String posField;

	/**
	 * The name of the field in Mongo that contains the relation values
	 *
	 * @baleen.config value
	 */
	public static final String PARAM_VALUES_FIELD = "valueField";
	@ConfigurationParameter(name = PARAM_VALUES_FIELD, defaultValue = "value")
	private String valuesField;

	/**
	 * The stemming algorithm to use, as defined in OpenNLP's SnowballStemmer.ALGORITHM enum
	 *
	 * @baleen.config ENGLISH
	 */
	public static final String PARAM_ALGORITHM = "algorithm";
	@ConfigurationParameter(name = PARAM_ALGORITHM, defaultValue = "ENGLISH")
	protected String algorithm;

	/**
	 * Should the words be stemmed before processing?
	 *
	 * Set false if you want a very precise match against your values, effectively they must be the
	 * interaction values. Set to true for a more relaxed match but which might produce false
	 * positives.
	 *
	 * @baleen.config true
	 */
	public static final String PARAM_STEM = "stem";
	@ConfigurationParameter(name = PARAM_STEM, defaultValue = "true")
	protected boolean stem;

	/** Lookup from a "pos-initial:normalised-word" key (see {@link #toKey}) to the definitions it may denote. */
	private final Multimap<String, InteractionTypeDefinition> definitions = HashMultimap.create();
	private SnowballStemmer stemmer;

	/**
	 * Creates the stemmer and loads every interaction type definition from the configured Mongo
	 * collection into the in-memory lookup.
	 *
	 * An unknown stemming algorithm name is logged and replaced with ENGLISH. Documents that lack a
	 * part of speech or a list of values are logged and skipped rather than aborting initialisation.
	 *
	 * @param aContext
	 *            the UIMA context, passed through to the superclass
	 * @throws ResourceInitializationException
	 *             if the superclass initialisation fails
	 */
	@Override
	public void doInitialize(final UimaContext aContext) throws ResourceInitializationException {
		super.doInitialize(aContext);

		ALGORITHM algo;
		try {
			algo = ALGORITHM.valueOf(algorithm);
		} catch (IllegalArgumentException iae) {
			getMonitor().warn("Algorithm {} doesn't exist, defaulting to ENGLISH", algorithm, iae);
			algo = ALGORITHM.ENGLISH;
		}
		stemmer = new SnowballStemmer(algo);

		final MongoCollection<Document> dbCollection = mongo.getDB().getCollection(collection);
		for (Document o : dbCollection.find()) {
			addDefinitions(o);
		}
	}

	/**
	 * Registers the definition described by a single Mongo document under one key per string value.
	 *
	 * @param document
	 *            a document from the relation types collection
	 */
	private void addDefinitions(Document document) {
		String type = (String) document.get(typeField);
		String subType = (String) document.get(subTypeField);
		String pos = (String) document.get(posField);
		Object values = document.get(valuesField);

		InteractionTypeDefinition definition = new InteractionTypeDefinition(type, subType, pos);

		// A key cannot be built without a part of speech, and there is nothing to index without
		// values, so a malformed record is skipped instead of failing the whole pipeline start-up.
		String definitionPos = definition.getPos();
		if (definitionPos == null || definitionPos.isEmpty() || !(values instanceof List)) {
			getMonitor().warn(
					"Skipping interaction definition (type {}, subtype {}) with missing part of speech or values",
					type, subType);
			return;
		}

		for (Object value : (List<?>) values) {
			if (value instanceof String) {
				definitions.put(toKey(definitionPos, (String) value), definition);
			}
		}
	}

	/**
	 * Builds the lookup key for a word with a given part of speech.
	 *
	 * The key is the lower-cased first character of the part of speech, a colon, then the word
	 * lower-cased, trimmed and (when stemming is enabled) stemmed.
	 *
	 * @param pos
	 *            the part of speech tag; must be non-null and non-empty
	 * @param word
	 *            the word to normalise; must be non-null
	 * @return the lookup key
	 */
	private String toKey(String pos, String word) {
		// Locale.ROOT keeps keys identical regardless of the JVM's default locale
		CharSequence normalised = word.toLowerCase(Locale.ROOT).trim();
		if (stem) {
			normalised = stemmer.stem(normalised);
		}
		return String.format("%s:%s", Character.toLowerCase(pos.charAt(0)), normalised);
	}

	/**
	 * Replaces each interaction with typed copies, one for each matching definition.
	 *
	 * For every interaction, the covered word tokens whose text equals the interaction text (ignoring
	 * case) supply the candidate parts of speech. Each resulting key is looked up in the loaded
	 * definitions and, for every definition found, a copy of the interaction is added with its
	 * relationship type and subtype set. All original interactions are removed afterwards, whether
	 * or not any copy was created for them.
	 *
	 * @param jCas
	 *            the document being processed
	 * @throws AnalysisEngineProcessException
	 *             declared by the overridden method
	 */
	@Override
	protected void doProcess(JCas jCas) throws AnalysisEngineProcessException {
		Map<Interaction, Collection<WordToken>> interactionToWords = JCasUtil.indexCovered(jCas, Interaction.class,
				WordToken.class);

		// Copy the selection first: new interactions are added to the index inside the loop
		Collection<Interaction> allInteractions = new ArrayList<>(JCasUtil.select(jCas, Interaction.class));
		for (Interaction interaction : allInteractions) {
			String value = interaction.getCoveredText();
			Collection<WordToken> words = interactionToWords.get(interaction);

			if (words == null || words.isEmpty() || value == null || value.isEmpty()) {
				continue;
			}

			// Look for a string match between the interaction value and the words then find all
			// the potential POS it could be. Keys (not raw tags) are de-duplicated because only
			// the first letter of the tag is used, so e.g. NN and NNS give the same key.
			Stream<String> keys = words.stream()
					.filter(w -> w.getCoveredText().equalsIgnoreCase(value))
					.map(WordToken::getPartOfSpeech)
					.filter(Objects::nonNull)
					.filter(p -> !p.isEmpty())
					.map(p -> toKey(p, value))
					.distinct();

			// The key embeds the POS, so this lookup also does the POS matching for us
			keys.map(definitions::get)
					.filter(l -> l != null && !l.isEmpty())
					.flatMap(Collection::stream)
					.forEach(d -> {
						Interaction i = ComparableEntitySpanUtils.copyInteraction(jCas, interaction.getBegin(),
								interaction.getEnd(), interaction);

						i.setRelationshipType(d.getType());
						i.setRelationSubType(d.getSubType());

						addToJCasIndex(i);
					});
		}

		// Delete the old interactions; each has either been replaced by typed copies or had no match
		removeFromJCasIndex(allInteractions);
	}

	/**
	 * Describes this annotator for pipeline ordering: WordToken and Interaction are passed as the
	 * first argument and an empty set as the second (presumably inputs and outputs respectively;
	 * confirm against AnalysisEngineAction).
	 *
	 * @return the action descriptor
	 */
	@Override
	public AnalysisEngineAction getAction() {
		return new AnalysisEngineAction(ImmutableSet.of(WordToken.class, Interaction.class), Collections.emptySet());
	}
}