//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.resources;

import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Stream;

import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceSpecifier;

import net.sf.extjwnl.JWNLException;
import net.sf.extjwnl.data.IndexWord;
import net.sf.extjwnl.data.POS;
import net.sf.extjwnl.data.Synset;
import net.sf.extjwnl.dictionary.Dictionary;

import uk.gov.dstl.baleen.uima.BaleenResource;

/**
 * A shared resource that provides access to WordNet.
 *
 * @baleen.javadoc
 */
public class SharedWordNetResource extends BaleenResource {

  private Dictionary dictionary;

  @Override
  protected boolean doInitialize(final ResourceSpecifier specifier,
      final Map<String, Object> additionalParams) throws ResourceInitializationException {
    try {
      // Load the default extJWNL dictionary instance (configured through extJWNL's own resources)
      dictionary = Dictionary.getDefaultResourceInstance();
    } catch (final JWNLException e) {
      throw new ResourceInitializationException(e);
    }
    return super.doInitialize(specifier, additionalParams);
  }

  @Override
  protected void doDestroy() {
    super.doDestroy();
    try {
      dictionary.close();
    } catch (final JWNLException e) {
      getLogger().warn("WordNet dictionary did not close cleanly", e);
    } finally {
      dictionary = null;
    }
  }

  /**
   * Get the WordNet dictionary.
   *
   * @return the WordNet dictionary
   */
  public Dictionary getDictionary() {
    return dictionary;
  }

  /**
   * Look up a word in the dictionary, performing lemmatisation if required.
   *
   * @param pos
   *          the part of speech
   * @param word
   *          the word
   * @return the WordNet word (as an optional)
   */
  public Optional<IndexWord> lookupWord(final POS pos, final String word) {
    try {
      return Optional.ofNullable(dictionary.lookupIndexWord(pos, word));
    } catch (final JWNLException e) {
      getMonitor().warn("Lookup word failed", e);
      return Optional.empty();
    }
  }

  /**
   * Get an exact lemma from the dictionary.
   *
   * @param pos
   *          the part of speech
   * @param lemma
   *          the lemma
   * @return the WordNet word (as an optional)
   */
  public Optional<IndexWord> getWord(final POS pos, final String lemma) {
    try {
      return Optional.ofNullable(dictionary.getIndexWord(pos, lemma));
    } catch (final JWNLException e) {
      getMonitor().warn("Get word failed", e);
      return Optional.empty();
    }
  }

  /**
   * Gets the supersenses of a word.
   *
   * The supersense is the original 'sense file' in which the word was defined.
   *
   * @param pos
   *          the part of speech
   * @param word
   *          the word
   * @return the supersenses
   */
  public Stream<String> getSuperSenses(final POS pos, final String word) {
    final Optional<IndexWord> indexWord = lookupWord(pos, word);
    if (!indexWord.isPresent()) {
      return Stream.empty();
    } else {
      // NOTE: WordNet's getSenses() does not play well with streams, so collect the
      // supersenses into a set imperatively and stream the result
      final List<Synset> senses = indexWord.get().getSenses();
      final Set<String> set = new HashSet<>();
      for (final Synset s : senses) {
        set.add(stripPOSFromSupersense(s.getLexFileName()));
      }
      return set.stream();
    }
  }

  /**
   * Gets the best supersense for a word.
   *
   * @param pos
   *          the part of speech
   * @param word
   *          the word
   * @return the best supersense
   */
  public Optional<String> getBestSuperSense(final POS pos, final String word) {
    final Optional<IndexWord> indexWord = lookupWord(pos, word);
    if (!indexWord.isPresent()) {
      return Optional.empty();
    } else {
      final List<Synset> senses = indexWord.get().getSenses();
      if (senses.isEmpty()) {
        return Optional.empty();
      } else {
        // At this stage we could do something clever, such as looking at the gloss to see if
        // there are word overlaps, but we opt for the more predictable approach of selecting
        // the most commonly used sense.
        return Optional.of(stripPOSFromSupersense(senses.get(0).getLexFileName()));
      }
    }
  }

  /**
   * Strip the POS from a supersense.
   *
   * Since the lex file has a name such as "noun.cognition", we remove the "noun." prefix so that
   * verb-based and noun-based words of the same supersense have the same value.
   *
   * @param sense
   *          the sense
   * @return the supersense without its POS prefix
   */
  private String stripPOSFromSupersense(final String sense) {
    final int index = sense.indexOf(".") + 1;
    if (index != 0) {
      return sense.substring(index);
    } else {
      return sense;
    }
  }
}
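
// Usage sketch (illustrative only, not part of the class): a Baleen annotator would typically be
// injected with this shared resource via uimaFIT's @ExternalResource annotation and then query it
// directly. The resource key "wordnet" and the helper recordSupersense(...) below are assumed
// names chosen for this example.
//
//   @ExternalResource(key = "wordnet")
//   private SharedWordNetResource wordnet;
//
//   // ... inside the annotator's process method:
//   wordnet.getBestSuperSense(POS.NOUN, "dogs")
//       .ifPresent(sense -> recordSupersense(sense)); // recordSupersense is hypothetical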