//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.jobs.interactions; import java.io.IOException; import java.util.HashSet; import java.util.Set; import java.util.stream.Collectors; import java.util.stream.Stream; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.ExternalResource; import net.sf.extjwnl.data.IndexWord; import net.sf.extjwnl.data.Synset; import net.sf.extjwnl.dictionary.Dictionary; import uk.gov.dstl.baleen.jobs.interactions.data.InteractionDefinition; import uk.gov.dstl.baleen.jobs.interactions.data.Word; import uk.gov.dstl.baleen.jobs.interactions.io.CsvInteractionReader; import uk.gov.dstl.baleen.jobs.interactions.io.CsvInteractionWriter; import uk.gov.dstl.baleen.resources.SharedWordNetResource; import uk.gov.dstl.baleen.uima.BaleenTask; import uk.gov.dstl.baleen.uima.JobSettings; /** * Enhance and extend the list of interaction words through WordNet. * <p> * This is useful for increasing the range of a words which are considered for interaction gazetteer * matching without increasing the manual effort. It is likely that the user will want to review the * words after running this, to ensure the words truly have the same meaning that the relationship * requires. * <p> * The CSV file, see {@link CsvInteractionReader} and {@link CsvInteractionWriter} for format * information, is read. This lemma and POS are used to find additional dictionary words which have * the same meaning. * <p> * The output is saved back in the same format. * * * @baleen.javadoc * */ public class EnhanceInteractions extends BaleenTask { /** * Connection to Wordnet * * @baleen.resource uk.gov.dstl.baleen.resources.SharedWordNetResource */ public static final String KEY_WORDNET = "wordnet"; @ExternalResource(key = KEY_WORDNET) private SharedWordNetResource wordnet; /** * Save the data to csv, with filename prefixed by tje value. * * Leave this blank for no output. * * @baleen.config csv interactions.csv */ public static final String KEY_CSV_INPUT = "input"; @ConfigurationParameter(name = KEY_CSV_INPUT, defaultValue = "interactions.csv") private String inputFilename; /** * Save the data to csv, with filename prefixed by tje value. * * Leave this blank for no output. * * @baleen.config csv interactions-enhanced.csv */ public static final String KEY_CSV_OUTPUT = "output"; @ConfigurationParameter(name = KEY_CSV_OUTPUT, defaultValue = "interactions-enhanced.csv") private String outputFilename; private Dictionary dictionary; @Override protected void execute(JobSettings settings) throws AnalysisEngineProcessException { dictionary = wordnet.getDictionary(); try (CsvInteractionWriter writer = new CsvInteractionWriter(outputFilename)) { final CsvInteractionReader reader = new CsvInteractionReader(inputFilename); writer.initialise(); reader.read((i, a) -> { final Set<String> alternatives = getAlternativeWords(i.getWord()) .map(s -> s.trim().toLowerCase()) // We don't want any small words, they are too commons .filter(s -> s.length() > 2) // We don't want any phrases .filter(s -> s.indexOf(" ") == -1) .collect(Collectors.toSet()); // Add in whatever the user provided alternatives.addAll(a); writeRow(writer, i, alternatives); }); getMonitor().info("Interaction enhacement complete and written to {}", outputFilename); } catch (final IOException e) { throw new AnalysisEngineProcessException(e); } } private void writeRow(CsvInteractionWriter writer, InteractionDefinition interaction, Set<String> alternatives){ try { writer.write(interaction, alternatives); } catch (final Exception e) { getMonitor().warn("Unable to write CSV row", e); } } /** * Gets the alternative words from the dictionary. * * @param word * the word * @return the alternative words (non null and always contains the word itself) */ private Stream<String> getAlternativeWords(Word word) { IndexWord indexWord = null; try { indexWord = dictionary.lookupIndexWord(word.getPos(), word.getLemma()); } catch (final Exception e) { getMonitor().debug("Unable to find word in wordnet, defaulting to lemma form", e); } if (indexWord == null) { return Stream.of(word.getLemma()); } Set<String> set = new HashSet<String>(); set.add(word.getLemma()); for (Synset synset : indexWord.getSenses()) { for (net.sf.extjwnl.data.Word w : synset.getWords()) { set.add(w.getLemma()); } } return set.stream(); } }