IdentifyInteractions.java example

Explorer
baleen-master
//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.jobs.interactions;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ExternalResource;
import org.apache.uima.resource.ResourceInitializationException;
import org.bson.Document;

import com.google.common.base.Strings;
import com.mongodb.client.MongoCollection;

import uk.gov.dstl.baleen.jobs.interactions.data.InteractionWord;
import uk.gov.dstl.baleen.jobs.interactions.data.PatternReference;
import uk.gov.dstl.baleen.jobs.interactions.data.Word;
import uk.gov.dstl.baleen.jobs.interactions.impl.InteractionIdentifier;
import uk.gov.dstl.baleen.jobs.interactions.io.CsvInteractionWriter;
import uk.gov.dstl.baleen.jobs.interactions.io.InteractionWriter;
import uk.gov.dstl.baleen.jobs.interactions.io.MonitorInteractionWriter;
import uk.gov.dstl.baleen.resources.SharedMongoResource;
import uk.gov.dstl.baleen.resources.SharedWordNetResource;
import uk.gov.dstl.baleen.resources.utils.WordNetUtils;
import uk.gov.dstl.baleen.uima.BaleenTask;
import uk.gov.dstl.baleen.uima.JobSettings;

/**
 * Identify interaction words based on a Mongo collection of patterns.
 * <p>
 * This requires a Wordnet dictionary and a Mongo resource (to read from). The Mongo collection
 * should hold patterns which have been extracted by a pipeline containing {@link MongoPatternSaver}.
 * <p>
 * See {@link InteractionIdentifier} for more details of the implementation.
 * <p>
 * The relationship types are based on Wordnet supersenses (meaning the original file in which the
 * word is defined). This provides a group of around 40 definitions.
 * <p>
 * The output of this process is a CSV (format defined by {@link CsvInteractionWriter}).
 *
 * <pre>
 * mongo:
 *   db: baleen
 *   host: localhost
 *
 * job:
 *   tasks:
 *   - class: interactions.IdentifyInteractions
 *     filename: output/interactions.csv
 * </pre>
 *
 * Typically you will want to edit / review the CSV file, then run {@link EnhanceInteractions} and
 * then {@link UploadInteractionsToMongo}.
 *
 * @baleen.javadoc
 */
public class IdentifyInteractions extends BaleenTask {

	/**
	 * Connection to Mongo
	 *
	 * @baleen.resource uk.gov.dstl.baleen.resources.SharedMongoResource
	 */
	public static final String KEY_MONGO = "mongo";
	@ExternalResource(key = KEY_MONGO)
	private SharedMongoResource mongo;

	/**
	 * Connection to Wordnet
	 *
	 * @baleen.resource uk.gov.dstl.baleen.resources.SharedWordNetResource
	 */
	public static final String KEY_WORDNET = "wordnet";
	@ExternalResource(key = KEY_WORDNET)
	private SharedWordNetResource wordnet;

	/**
	 * The name of the Mongo collection to read patterns from.
	 *
	 * @baleen.config patternCollection patterns
	 */
	public static final String KEY_PATTERN_COLLECTION = "patternCollection";
	@ConfigurationParameter(name = KEY_PATTERN_COLLECTION, defaultValue = "patterns")
	private String patternCollection;

	/**
	 * Minimum number of patterns to be considered a cluster.
	 *
	 * I.e. the number of evidence points we need before we start to consider an interaction.
	 *
	 * @baleen.config minPatterns 2
	 */
	public static final String KEY_MIN_PATTERNS_IN_CLUSTER = "minPatterns";
	@ConfigurationParameter(name = KEY_MIN_PATTERNS_IN_CLUSTER, defaultValue = "2")
	private int minPatternsInCluster;

	/**
	 * Minimum number of occurrences of a word in a cluster before it is considered potentially the
	 * interaction word.
	 *
	 * Note that this should be equal to or higher than minPatterns.
	 *
	 * @baleen.config minOccurances 2
	 */
	public static final String KEY_MIN_OCCURANCE = "minOccurances";
	@ConfigurationParameter(name = KEY_MIN_OCCURANCE, defaultValue = "2")
	private int minWordOccurance;

	/**
	 * The similarity threshold between two patterns (before they are considered the same). (Higher
	 * is more similar)
	 *
	 * @baleen.config threshold 0.2
	 */
	public static final String KEY_THRESHOLD = "threshold";
	@ConfigurationParameter(name = KEY_THRESHOLD, defaultValue = "0.2")
	private double threshold;

	/**
	 * Log the information on completion
	 *
	 * @baleen.config log false
	 */
	public static final String KEY_OUTPUT = "log";
	@ConfigurationParameter(name = KEY_OUTPUT, defaultValue = "false")
	private boolean outputToLog;

	/**
	 * Filename handed to the {@link CsvInteractionWriter} which saves the data as CSV.
	 *
	 * Leave this blank for no CSV output.
	 *
	 * @baleen.config filename interactions.csv
	 */
	public static final String KEY_CSV_FILENAME = "filename";
	@ConfigurationParameter(name = KEY_CSV_FILENAME, defaultValue = "interactions.csv")
	private String csvFilename;

	/** The writers, selected by configuration in {@link #doInitialize(UimaContext)}. */
	private final List<InteractionWriter> interactionWriters = new ArrayList<>();

	/**
	 * Set up the interaction writers requested by the configuration: a monitor (log) writer when
	 * {@code log} is true, and a CSV writer when {@code filename} is neither null nor empty.
	 *
	 * @param aContext
	 *            the UIMA context
	 * @throws ResourceInitializationException
	 *             if the parent initialisation fails
	 */
	@Override
	public void doInitialize(UimaContext aContext) throws ResourceInitializationException {
		super.doInitialize(aContext);

		if (outputToLog) {
			interactionWriters.add(new MonitorInteractionWriter(getMonitor()));
		}

		if (!Strings.isNullOrEmpty(csvFilename)) {
			interactionWriters.add(new CsvInteractionWriter(csvFilename));
		}
	}

	/**
	 * Run the job: load the patterns from Mongo, pass them to an {@link InteractionIdentifier} and
	 * send the resulting interaction words to the configured writers.
	 *
	 * @param settings
	 *            the job settings (not used by this task)
	 * @throws AnalysisEngineProcessException
	 *             declared by the parent contract
	 */
	@Override
	protected void execute(JobSettings settings) throws AnalysisEngineProcessException {
		final InteractionIdentifier identifier = new InteractionIdentifier(getMonitor(), minPatternsInCluster,
				minWordOccurance,
				threshold);
		getMonitor().info("Loading patterns from Mongo");
		final List<PatternReference> patterns = readPatternsFromMongo();
		getMonitor().info("Found {} patterns", patterns.size());
		getMonitor().info("Extracting interaction words...");
		final Stream<InteractionWord> words = identifier.process(patterns);
		getMonitor().info("Writing interaction words...");
		write(words);
		getMonitor().info("Interaction identification complete");
	}

	/**
	 * Read patterns from mongo.
	 * <p>
	 * Each document is expected to hold a {@code words} array and {@code source} / {@code target}
	 * sub-documents. Documents missing any of these are skipped with a warning rather than aborting
	 * the whole job.
	 *
	 * @return the list of patterns (never null, possibly empty)
	 */
	private List<PatternReference> readPatternsFromMongo() {
		// TODO: Ideally this would do something in a more streaming manner, as there are likely to
		// be lots of examples. Loading all patterns into memory might be prohibitive.

		final MongoCollection<Document> collection = mongo.getDB().getCollection(patternCollection);

		// Deliberately not presized from collection.count(): the count is a long, and narrowing it
		// to int could produce a negative capacity (which ArrayList rejects).
		final List<PatternReference> patterns = new ArrayList<>();

		for (final Document doc : collection.find()) {
			@SuppressWarnings("unchecked")
			final List<Document> words = doc.get("words", List.class);
			final Document source = doc.get("source", Document.class);
			final Document target = doc.get("target", Document.class);

			if (words == null || source == null || target == null) {
				getMonitor().warn("Skipping pattern {} as it is missing words, source or target", doc.get("_id"));
				continue;
			}

			// Drop words which could not be converted, or which have no Wordnet part of speech
			final List<Word> tokens = words.stream()
					.map(this::toWord)
					.filter(w -> w != null && w.getPos() != null)
					.collect(Collectors.toList());

			final PatternReference pattern = new PatternReference(doc.get("_id").toString(), tokens);
			pattern.setSourceType(source.getString("type"));
			pattern.setTargetType(target.getString("type"));
			patterns.add(pattern);
		}

		return patterns;
	}

	/**
	 * Convert a stored word document into a {@link Word}.
	 * <p>
	 * The lemma is used when present, falling back to the actual text otherwise. The value is trimmed
	 * and lower cased using {@link Locale#ROOT} so the result does not depend on the JVM's default
	 * locale.
	 *
	 * @param word
	 *            the word document (may be null)
	 * @return the word, or null if the document has no part of speech or neither lemma nor text
	 */
	private Word toWord(final Document word) {
		if (word == null) {
			return null;
		}

		final String pos = word.getString("pos");
		String lemma = word.getString("lemma");

		// Fall back to actual text if no lemma
		if (lemma == null) {
			lemma = word.getString("text");
		}

		if (lemma == null || pos == null) {
			return null;
		}

		return new Word(lemma.trim().toLowerCase(Locale.ROOT), WordNetUtils.toPos(pos));
	}

	/**
	 * Write/save patterns to the writers.
	 * <p>
	 * Only writers which initialised successfully are written to. All writers are destroyed
	 * afterwards, even if processing the stream fails.
	 *
	 * @param words
	 *            the words
	 */
	private void write(Stream<InteractionWord> words) {

		final List<InteractionWriter> ready = new ArrayList<>(interactionWriters.size());
		for (final InteractionWriter writer : interactionWriters) {
			try {
				writer.initialise();
				ready.add(writer);
			} catch (final IOException e) {
				getMonitor().error("Unable to initialise writer", e);
			}
		}

		try {
			words.flatMap(interaction -> {
				final String lemma = interaction.getWord().getLemma();

				// Use the lemma itself as the relationship type when Wordnet offers no super sense
				final String relationshipType = wordnet.getBestSuperSense(interaction.getWord().getPos(), lemma)
						.orElse(lemma);

				return interaction.toRelations(relationshipType, lemma);
			})
					.distinct()
					.forEach(r -> ready.forEach(w -> {
						try {
							w.write(r);
						} catch (final IOException e) {
							getMonitor().warn("Unable to write interaction", e);
						}
					}));
		} finally {
			// Release every writer, including any which failed to initialise
			interactionWriters.forEach(InteractionWriter::destroy);
		}
	}

}