//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.resources;

import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Stream;

import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceSpecifier;

import net.sf.extjwnl.JWNLException;
import net.sf.extjwnl.data.IndexWord;
import net.sf.extjwnl.data.POS;
import net.sf.extjwnl.data.Synset;
import net.sf.extjwnl.dictionary.Dictionary;

import uk.gov.dstl.baleen.uima.BaleenResource;

/**
 * A shared resource that provides access to WordNet.
 *
 * @baleen.javadoc
 */
public class SharedWordNetResource extends BaleenResource {

  private Dictionary dictionary;

  @Override
  protected boolean doInitialize(final ResourceSpecifier specifier,
      final Map<String, Object> additionalParams) throws ResourceInitializationException {
    try {
      // Load the default extJWNL dictionary instance (configured through extJWNL's own resources)
      dictionary = Dictionary.getDefaultResourceInstance();
    } catch (final JWNLException e) {
      throw new ResourceInitializationException(e);
    }
    return super.doInitialize(specifier, additionalParams);
  }

  @Override
  protected void doDestroy() {
    super.doDestroy();
    try {
      dictionary.close();
    } catch (final JWNLException e) {
      getLogger().warn("WordNet dictionary did not close cleanly", e);
    } finally {
      dictionary = null;
    }
  }

  /**
   * Get the WordNet dictionary.
   *
   * @return the WordNet dictionary
   */
  public Dictionary getDictionary() {
    return dictionary;
  }

  /**
   * Look up a word in the dictionary, performing lemmatisation if required.
   *
   * @param pos
   *          the part of speech
   * @param word
   *          the word
   * @return the WordNet word (as an optional)
   */
  public Optional<IndexWord> lookupWord(final POS pos, final String word) {
    try {
      return Optional.ofNullable(dictionary.lookupIndexWord(pos, word));
    } catch (final JWNLException e) {
      getMonitor().warn("Lookup word failed", e);
      return Optional.empty();
    }
  }

  /**
   * Get an exact lemma from the dictionary.
   *
   * @param pos
   *          the part of speech
   * @param lemma
   *          the lemma
   * @return the WordNet word (as an optional)
   */
  public Optional<IndexWord> getWord(final POS pos, final String lemma) {
    try {
      return Optional.ofNullable(dictionary.getIndexWord(pos, lemma));
    } catch (final JWNLException e) {
      getMonitor().warn("Get word failed", e);
      return Optional.empty();
    }
  }

  /**
   * Gets the supersenses of a word.
   *
   * The supersense is the original 'sense file' in which the word was defined.
   *
   * @param pos
   *          the part of speech
   * @param word
   *          the word
   * @return the supersenses
   */
  public Stream<String> getSuperSenses(final POS pos, final String word) {
    final Optional<IndexWord> indexWord = lookupWord(pos, word);
    if (!indexWord.isPresent()) {
      return Stream.empty();
    } else {
      // NOTE: WordNet's getSenses() does not play well with streams, so collect the
      // supersenses into a set imperatively and stream the result
      final List<Synset> senses = indexWord.get().getSenses();
      final Set<String> set = new HashSet<>();
      for (final Synset s : senses) {
        set.add(stripPOSFromSupersense(s.getLexFileName()));
      }
      return set.stream();
    }
  }

  /**
   * Gets the best supersense for a word.
   *
   * @param pos
   *          the part of speech
   * @param word
   *          the word
   * @return the best supersense
   */
  public Optional<String> getBestSuperSense(final POS pos, final String word) {
    final Optional<IndexWord> indexWord = lookupWord(pos, word);
    if (!indexWord.isPresent()) {
      return Optional.empty();
    } else {
      final List<Synset> senses = indexWord.get().getSenses();
      if (senses.isEmpty()) {
        return Optional.empty();
      } else {
        // At this stage we could do something clever, such as looking at the gloss to see if
        // there are word overlaps, but we opt for the more predictable approach of selecting
        // the most commonly used sense.
        return Optional.of(stripPOSFromSupersense(senses.get(0).getLexFileName()));
      }
    }
  }

  /**
   * Strip the POS from a supersense.
   *
   * Since the lex file has a name such as "noun.cognition", we remove the "noun." prefix so that
   * verb-based and noun-based words of the same supersense have the same value.
   *
   * @param sense
   *          the sense
   * @return the supersense without its POS prefix
   */
  private String stripPOSFromSupersense(final String sense) {
    final int index = sense.indexOf(".") + 1;
    if (index != 0) {
      return sense.substring(index);
    } else {
      return sense;
    }
  }
}
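
// Usage sketch (illustrative only, not part of the class): a Baleen annotator would typically be
// injected with this shared resource via uimaFIT's @ExternalResource annotation and then query it
// directly. The resource key "wordnet" and the helper recordSupersense(...) below are assumed
// names chosen for this example.
//
//   @ExternalResource(key = "wordnet")
//   private SharedWordNetResource wordnet;
//
//   // ... inside the annotator's process method:
//   wordnet.getBestSuperSense(POS.NOUN, "dogs")
//       .ifPresent(sense -> recordSupersense(sense)); // recordSupersense is hypothetical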