SharedGenderMultiplicityResource.java example

Explorer
baleen-master
//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.resources;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.zip.GZIPInputStream;

import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceSpecifier;

import com.google.common.base.Splitter;

import uk.gov.dstl.baleen.resources.data.Gender;
import uk.gov.dstl.baleen.resources.data.Multiplicity;
import uk.gov.dstl.baleen.uima.BaleenResource;

/**
 * Resource for looking up the gender and multiplicity of noun phrases.
 * <p>
 * Due to the nature of the data the gender should be of reasonably high quality, but the
 * multiplicity is poor. This is because there is no singular count in the data, so it is
 * impossible to understand how often, relatively, a word is used as singular vs plural.
 * <p>
 * Patterns with numbers (rows containing '#') are ignored.
 * <p>
 * The implementation stores three maps (per gender / multiplicity): one for exact matches, one for
 * starts-with and one for ends-with. A row whose noun phrase ends with '!' is stored as a
 * starts-with entry, a row whose noun phrase starts with '!' is stored as an ends-with entry, and
 * everything else is stored as an exact entry. To improve performance the ends-with map contains
 * the reversed text so it can be matched backwards.
 * <p>
 * This data was originally from http://www.clsp.jhu.edu/~sbergsma/Gender/ and the data is licenced
 * under the Creative Commons Unported License. See the associated paper: Shane Bergsma and Dekang
 * Lin, Bootstrapping Path-Based Pronoun Resolution, In Proceedings of the Conference on
 * Computational Linguistics / Association for Computational Linguistics (COLING/ACL-06), Sydney,
 * Australia, July 17-21, 2006.
 *
 */
public class SharedGenderMultiplicityResource extends BaleenResource {
	// TODO: I'm not sure if the exact match is correct - should we look for an exact match or
	// should this be put into both start and end?
	// Perhaps the whole thing would be better as contains but that would be very slow. Better to
	// return UNKNOWN than guess

	/** Names of the gzipped data files, resolved under "gender/" relative to this class. */
	private static final List<String> GENDER_FILES = Arrays.asList("gender.aa.gz", "gender.ab.gz", "gender.ac.gz",
			"gender.ad.gz", "gender.ae.gz", "gender.af.gz");

	/** Marker that, at the start or end of a noun phrase, turns the row into a suffix / prefix pattern. */
	private static final String PATTERN_MARKER = "!";

	private static final Splitter LINE_SPLITTER = Splitter.on(" ");
	private static final Splitter WORD_SPLITTER = Splitter.on(" ").trimResults().omitEmptyStrings();

	// TODO: These values are just guesses
	private static final int GENDER_SAMPLE_THRESHOLD = 20;
	private static final int PLURAL_THRESHOLD = 200;

	private final Map<String, Multiplicity> exactMultiplicity = new HashMap<>();
	private final Map<String, Multiplicity> endsWithMultiplicity = new HashMap<>();
	private final Map<String, Multiplicity> startsWithMultiplicity = new HashMap<>();

	private final Map<String, Gender> exactGender = new HashMap<>();
	private final Map<String, Gender> endsWithGender = new HashMap<>();
	private final Map<String, Gender> startsWithGender = new HashMap<>();

	/**
	 * Loads every gender data file into the lookup maps and then delegates to the superclass.
	 * <p>
	 * Lines without a tab and lines containing '#' are skipped. A file that cannot be read is
	 * logged as a warning and contributes no rows; it does not fail initialisation.
	 *
	 * @param specifier passed through to the superclass
	 * @param additionalParams passed through to the superclass
	 * @return the result of the superclass initialisation
	 * @throws ResourceInitializationException if the superclass initialisation fails
	 */
	@Override
	protected boolean doInitialize(ResourceSpecifier specifier, Map<String, Object> additionalParams)
			throws ResourceInitializationException {

		GENDER_FILES.stream()
				.flatMap(this::readLines)
				.filter(s -> s.contains("\t"))
				// TODO: Currently ignore any of the numerical stuff, it's too tedious to work with
				.filter(s -> !s.contains("#"))
				.forEach(this::loadFromGenderRow);

		return super.doInitialize(specifier, additionalParams);
	}

	/**
	 * Reads all lines of one gzipped, UTF-8 encoded data file from the classpath.
	 *
	 * @param file the file name, resolved under "gender/" relative to this class
	 * @return the lines of the file, or an empty stream if the file could not be read
	 */
	private Stream<String> readLines(String file) {
		// The raw stream is its own resource so that it is still closed when the GZIP wrapper
		// fails to construct (e.g. bad header). A null resource is skipped on close, and the
		// resulting exception from the wrapper is reported by the catch below.
		try (InputStream raw = getClass().getResourceAsStream("gender/" + file);
				BufferedReader reader = new BufferedReader(
						new InputStreamReader(new GZIPInputStream(raw), StandardCharsets.UTF_8))) {
			// The lines must be fully read before returning: the reader is closed on exit from
			// this try block, so a lazy lines() stream would fail.
			final List<String> lines = reader.lines().collect(Collectors.toList());
			return lines.stream();
		} catch (final Exception e) {
			getMonitor().warn("Unable to load from gender file", e);
			return Stream.empty();
		}
	}

	/**
	 * Parses one data row and records its gender and multiplicity, if the counts are conclusive.
	 * <p>
	 * A row is a noun phrase, a tab, and then space separated integers of which the first four are
	 * read as the M, F, N and plural counts. A gender is only recorded when the combined gender
	 * count exceeds {@link #GENDER_SAMPLE_THRESHOLD} and one count is more than double both
	 * others. Rows that cannot be parsed are logged as a warning and skipped.
	 *
	 * @param s the raw row (expected to contain a tab)
	 */
	private void loadFromGenderRow(String s) {
		try {
			final String[] line = s.split("\t", 2);
			// Locale.ROOT keeps the keys stable regardless of the JVM default locale
			final String np = line[0].trim().toLowerCase(Locale.ROOT);
			final Iterator<String> counts = LINE_SPLITTER.split(line[1]).iterator();

			final int m = Integer.parseInt(counts.next());
			final int f = Integer.parseInt(counts.next());
			final int n = Integer.parseInt(counts.next());
			final int p = Integer.parseInt(counts.next());

			final int genderTotal = m + f + n;
			final boolean enoughSamples = genderTotal > GENDER_SAMPLE_THRESHOLD;

			if (enoughSamples) {
				if (m > 2 * Math.max(f, n)) {
					saveGender(np, Gender.M);
				} else if (f > 2 * Math.max(m, n)) {
					saveGender(np, Gender.F);
				} else if (n > 2 * Math.max(m, f)) {
					saveGender(np, Gender.N);
				}
			}

			if (p > PLURAL_THRESHOLD) {
				// TODO: Since we don't have a singular count I guess we just have a
				// threshold here? I can't see how you compare to the m/f/n words
				saveMultiplicity(np, Multiplicity.PLURAL);
			} else if (enoughSamples) {
				// If we've seen it a lot otherwise we assume it must be singular
				saveMultiplicity(np, Multiplicity.SINGULAR);
			}

		} catch (final Exception e) {
			// Best effort: a malformed row (too few columns, non-numeric count) is skipped
			getMonitor().warn("Unable to parse line {}", s, e);
		}
	}

	/** Records the multiplicity of a noun phrase in the map matching its pattern type. */
	private void saveMultiplicity(String np, Multiplicity multiplicity) {
		save(exactMultiplicity, startsWithMultiplicity, endsWithMultiplicity, np, multiplicity);
	}

	/** Records the gender of a noun phrase in the map matching its pattern type. */
	private void saveGender(String np, Gender gender) {
		save(exactGender, startsWithGender, endsWithGender, np, gender);
	}

	/**
	 * Stores a value under the noun phrase, choosing the map from the position of the '!' marker.
	 * <p>
	 * The key is the noun phrase with every '!' removed and then trimmed. A leading '!' stores the
	 * reversed key in the ends-with map, a trailing '!' stores the key in the starts-with map,
	 * and anything else is stored in the exact map.
	 */
	private <T> void save(Map<String, T> exact, Map<String, T> startsWith, Map<String, T> endsWith, String np,
			T value) {
		final String key = np.replace(PATTERN_MARKER, "").trim();

		if (np.startsWith(PATTERN_MARKER)) {
			endsWith.put(reverse(key), value);
		} else if (np.endsWith(PATTERN_MARKER)) {
			startsWith.put(key, value);
		} else {
			exact.put(key, value);
		}
	}

	/**
	 * Return the gender for a given string, if it is known, or UNKNOWN otherwise.
	 * <p>
	 * The text is lower-cased and tried as an exact match, then against the starts-with entries
	 * and finally against the ends-with entries.
	 *
	 * @param text the text to look up; null yields UNKNOWN
	 * @return the gender, or {@link Gender#UNKNOWN} if no entry matches
	 */
	public Gender lookupGender(String text) {
		return lookup(exactGender, startsWithGender, endsWithGender, text, Gender.UNKNOWN);
	}

	/**
	 * Return the multiplicity for a given string, if it is known, or UNKNOWN otherwise.
	 * <p>
	 * The text is lower-cased and tried as an exact match, then against the starts-with entries
	 * and finally against the ends-with entries.
	 *
	 * @param text the text to look up; null yields UNKNOWN
	 * @return the multiplicity, or {@link Multiplicity#UNKNOWN} if no entry matches
	 */
	public Multiplicity lookupMultiplicity(String text) {
		return lookup(exactMultiplicity, startsWithMultiplicity, endsWithMultiplicity, text, Multiplicity.UNKNOWN);
	}

	/**
	 * Looks the text up in the exact, starts-with and ends-with maps, in that order.
	 *
	 * @param exact entries matched against the whole lower-cased text
	 * @param startsWith entries matched against leading words of the text
	 * @param endsWith entries (stored reversed) matched against trailing words of the text
	 * @param inputText the text to look up, may be null
	 * @param defaultValue returned when the text is null or nothing matches
	 * @return the first matching value, or the default value
	 */
	private <T> T lookup(Map<String, T> exact, Map<String, T> startsWith, Map<String, T> endsWith, String inputText,
			T defaultValue) {
		if (inputText == null) {
			return defaultValue;
		}

		// Locale.ROOT mirrors how the keys were lower-cased when loading
		final String text = inputText.toLowerCase(Locale.ROOT);

		// Try an exact match
		final T exactMatch = exact.get(text);
		if (exactMatch != null) {
			return exactMatch;
		}

		final List<String> words = WORD_SPLITTER.splitToList(text);

		// Try start: leading words of the text
		final T startMatch = lookupLongestPrefix(startsWith, words);
		if (startMatch != null) {
			return startMatch;
		}

		// Try endWith: the words in reverse order with each word reversed, so that a prefix of
		// this list, joined by spaces, is the character-reversal of a trailing run of words.
		// NOTE(review): the loop stops before index 0, so an ends-with entry can never match
		// the complete text (and never matches single-word text). Confirm this is intended.
		final List<String> reversed = new ArrayList<>(words.size());
		for (int i = words.size() - 1; i > 0; i--) {
			reversed.add(reverse(words.get(i)));
		}

		final T endMatch = lookupLongestPrefix(endsWith, reversed);
		if (endMatch != null) {
			return endMatch;
		}

		return defaultValue;
	}

	/**
	 * Finds the value for the longest run of leading words that is a key in the map.
	 * <p>
	 * Candidates are the first k words joined by single spaces, tried from all words down to the
	 * first word alone, so the most specific entry wins.
	 *
	 * @param map the entries to search
	 * @param words the words to build candidate keys from
	 * @return the value of the first candidate found, or null if none is present
	 */
	private <T> T lookupLongestPrefix(Map<String, T> map, List<String> words) {
		for (int end = words.size(); end > 0; end--) {
			final T t = map.get(String.join(" ", words.subList(0, end)));
			if (t != null) {
				return t;
			}
		}

		return null;
	}

	/** Returns the characters of the string in reverse order. */
	private String reverse(String s) {
		return new StringBuilder(s).reverse().toString();
	}
}