//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.jobs.interactions;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ExternalResource;
import org.apache.uima.resource.ResourceInitializationException;
import org.bson.Document;

import com.google.common.base.Strings;
import com.mongodb.client.MongoCollection;

import uk.gov.dstl.baleen.jobs.interactions.data.InteractionWord;
import uk.gov.dstl.baleen.jobs.interactions.data.PatternReference;
import uk.gov.dstl.baleen.jobs.interactions.data.Word;
import uk.gov.dstl.baleen.jobs.interactions.impl.InteractionIdentifier;
import uk.gov.dstl.baleen.jobs.interactions.io.CsvInteractionWriter;
import uk.gov.dstl.baleen.jobs.interactions.io.InteractionWriter;
import uk.gov.dstl.baleen.jobs.interactions.io.MonitorInteractionWriter;
import uk.gov.dstl.baleen.resources.SharedMongoResource;
import uk.gov.dstl.baleen.resources.SharedWordNetResource;
import uk.gov.dstl.baleen.resources.utils.WordNetUtils;
import uk.gov.dstl.baleen.uima.BaleenTask;
import uk.gov.dstl.baleen.uima.JobSettings;

/**
 * Identify interaction words based on a Mongo collection of patterns.
 * <p>
 * This requires a Wordnet dictionary and a Mongo resource (to read from). The Mongo collection
 * should hold patterns which have been extracted by a pipeline containing
 * {@code MongoPatternSaver}.
 * <p>
 * See {@link InteractionIdentifier} for more details of the implementation.
 * <p>
 * The relationship types are based on Wordnet supersenses (meaning the original file in which the
 * word is defined). This provides a group of around 40 definitions.
 * <p>
 * The output of this process is a CSV (format defined by {@link CsvInteractionWriter}).
 *
 * <pre>
 * mongo:
 *   db: baleen
 *   host: localhost
 *
 * job:
 *   tasks:
 *   - class: interactions.IdentifyInteractions
 *     filename: output/interactions.csv
 * </pre>
 *
 * Typically you will want to edit / review the CSV file, then run {@link EnhanceInteractions} and
 * then {@link UploadInteractionsToMongo}.
 *
 * @baleen.javadoc
 */
public class IdentifyInteractions extends BaleenTask {

  /**
   * Connection to Mongo
   *
   * @baleen.resource uk.gov.dstl.baleen.resources.SharedMongoResource
   */
  public static final String KEY_MONGO = "mongo";
  @ExternalResource(key = KEY_MONGO)
  private SharedMongoResource mongo;

  /**
   * Connection to Wordnet
   *
   * @baleen.resource uk.gov.dstl.baleen.resources.SharedWordNetResource
   */
  public static final String KEY_WORDNET = "wordnet";
  @ExternalResource(key = KEY_WORDNET)
  private SharedWordNetResource wordnet;

  /**
   * The name of the Mongo collection to read
   *
   * @baleen.config patterns
   */
  public static final String KEY_PATTERN_COLLECTION = "patternCollection";
  @ConfigurationParameter(name = KEY_PATTERN_COLLECTION, defaultValue = "patterns")
  private String patternCollection;

  /**
   * Minimum number of patterns to be considered a cluster.
   *
   * I.e. the number of evidence points we need to start to consider an interaction.
   *
   * @baleen.config minPatterns 2
   */
  public static final String KEY_MIN_PATTERNS_IN_CLUSTER = "minPatterns";
  @ConfigurationParameter(name = KEY_MIN_PATTERNS_IN_CLUSTER, defaultValue = "2")
  private int minPatternsInCluster;

  /**
   * Minimum number of occurrences of a word in a cluster before it is considered potentially the
   * interaction word.
   *
   * Note that this should be equal to or higher than minPatterns.
   *
   * @baleen.config minOccurances 2
   */
  public static final String KEY_MIN_OCCURANCE = "minOccurances";
  @ConfigurationParameter(name = KEY_MIN_OCCURANCE, defaultValue = "2")
  private int minWordOccurance;

  /**
   * The similarity threshold between two patterns (before they are considered the same). (High is
   * more similar)
   *
   * @baleen.config threshold 0.2
   */
  public static final String KEY_THRESHOLD = "threshold";
  @ConfigurationParameter(name = KEY_THRESHOLD, defaultValue = "0.2")
  private double threshold;

  /**
   * Log the information on completion
   *
   * @baleen.config log false
   */
  public static final String KEY_OUTPUT = "log";
  @ConfigurationParameter(name = KEY_OUTPUT, defaultValue = "false")
  private boolean outputToLog;

  /**
   * The CSV file to save the data to.
   *
   * Leave this blank for no output.
   *
   * @baleen.config filename interactions.csv
   */
  public static final String KEY_CSV_FILENAME = "filename";
  @ConfigurationParameter(name = KEY_CSV_FILENAME, defaultValue = "interactions.csv")
  private String csvFilename;

  /** The writers every identified interaction is sent to (populated in doInitialize). */
  private final List<InteractionWriter> interactionWriters = new ArrayList<>();

  /**
   * Set up the configured output writers: a monitor (log) writer when {@code log} is true and a
   * CSV writer when {@code filename} is non-empty.
   *
   * @param aContext
   *            the UIMA context
   * @throws ResourceInitializationException
   *             if the parent initialisation fails
   */
  @Override
  public void doInitialize(UimaContext aContext) throws ResourceInitializationException {
    super.doInitialize(aContext);

    if (outputToLog) {
      interactionWriters.add(new MonitorInteractionWriter(getMonitor()));
    }

    if (!Strings.isNullOrEmpty(csvFilename)) {
      interactionWriters.add(new CsvInteractionWriter(csvFilename));
    }
  }

  /**
   * Load all patterns from Mongo, identify the interaction words within them and pass the result
   * to the configured writers.
   *
   * @param settings
   *            the job settings (not used by this task)
   * @throws AnalysisEngineProcessException
   *             declared by the task contract
   */
  @Override
  protected void execute(JobSettings settings) throws AnalysisEngineProcessException {
    final InteractionIdentifier identifier =
        new InteractionIdentifier(getMonitor(), minPatternsInCluster, minWordOccurance, threshold);

    getMonitor().info("Loading patterns from Mongo");
    final List<PatternReference> patterns = readPatternsFromMongo();
    getMonitor().info("Found {} patterns", patterns.size());

    getMonitor().info("Extracting interaction words...");
    final Stream<InteractionWord> words = identifier.process(patterns);

    getMonitor().info("Writing interaction words...");
    write(words);

    getMonitor().info("Interaction identification complete");
  }

  /**
   * Read patterns from mongo.
   *
   * Documents which lack the {@code words}, {@code source} or {@code target} fields are skipped
   * (with a warning) rather than aborting the whole job.
   *
   * @return the list of usable patterns (never null)
   */
  private List<PatternReference> readPatternsFromMongo() {
    // TODO: Ideally this would do something in a more streaming manner, as there are likely to
    // be lots of examples. Loading all patterns into memory might be prohibitive.
    final MongoCollection<Document> collection = mongo.getDB().getCollection(patternCollection);
    final List<PatternReference> patterns = new ArrayList<>((int) collection.count());

    for (final Document doc : collection.find()) {
      toPattern(doc).ifPresent(patterns::add);
    }

    return patterns;
  }

  /**
   * Convert a single Mongo document into a pattern reference.
   *
   * @param doc
   *            the Mongo document holding {@code _id}, {@code words}, {@code source} and
   *            {@code target}
   * @return the pattern, or empty if the document is missing a required field
   */
  private Optional<PatternReference> toPattern(Document doc) {
    final String id = String.valueOf(doc.get("_id"));

    @SuppressWarnings("unchecked")
    final List<Document> words = doc.get("words", List.class);
    final Document source = doc.get("source", Document.class);
    final Document target = doc.get("target", Document.class);

    if (words == null || source == null || target == null) {
      getMonitor().warn("Skipping pattern {} as it has no words, source or target", id);
      return Optional.empty();
    }

    final List<Word> tokens = words.stream()
        .map(IdentifyInteractions::toWord)
        .filter(Optional::isPresent)
        .map(Optional::get)
        .collect(Collectors.toList());

    final PatternReference pattern = new PatternReference(id, tokens);
    pattern.setSourceType(source.getString("type"));
    pattern.setTargetType(target.getString("type"));
    return Optional.of(pattern);
  }

  /**
   * Convert a stored word into a {@link Word}, using the lemma (or the raw text when no lemma
   * was stored), trimmed and lower-cased.
   *
   * @param token
   *            the word sub-document (fields {@code lemma}, {@code text}, {@code pos})
   * @return the word, or empty if it has neither lemma nor text, or its part of speech does not
   *         map to a Wordnet POS
   */
  private static Optional<Word> toWord(Document token) {
    String lemma = token.getString("lemma");
    // Fall back to actual text if no lemma
    if (lemma == null) {
      lemma = token.getString("text");
    }
    if (lemma == null) {
      return Optional.empty();
    }

    // Locale.ROOT keeps the lower-casing independent of the JVM default locale
    final Word word =
        new Word(lemma.trim().toLowerCase(Locale.ROOT), WordNetUtils.toPos(token.getString("pos")));
    return word.getPos() == null ? Optional.empty() : Optional.of(word);
  }

  /**
   * Write/save patterns to the writers.
   *
   * Writers which fail to initialise are reported and then skipped. All writers are destroyed
   * once writing has finished, even if processing the stream fails part way through.
   *
   * @param words
   *            the words
   */
  private void write(Stream<InteractionWord> words) {
    final List<InteractionWriter> ready = new ArrayList<>(interactionWriters.size());
    for (final InteractionWriter writer : interactionWriters) {
      try {
        writer.initialise();
        ready.add(writer);
      } catch (final IOException e) {
        getMonitor().error("Unable to initialise writer", e);
      }
    }

    try {
      words.flatMap(interaction -> {
        final String lemma = interaction.getWord().getLemma();
        // Use the Wordnet supersense as the relationship type, or the lemma itself if none
        final String relationshipType =
            wordnet.getBestSuperSense(interaction.getWord().getPos(), lemma).orElse(lemma);
        return interaction.toRelations(relationshipType, lemma);
      })
          .distinct()
          .forEach(relation -> ready.forEach(writer -> {
            try {
              writer.write(relation);
            } catch (final IOException e) {
              getMonitor().warn("Unable to write interaction", e);
            }
          }));
    } finally {
      interactionWriters.forEach(InteractionWriter::destroy);
    }
  }
}