//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.resources; import java.io.BufferedReader; import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.stream.Collectors; import java.util.stream.Stream; import java.util.zip.GZIPInputStream; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.ResourceSpecifier; import com.google.common.base.Splitter; import uk.gov.dstl.baleen.resources.data.Gender; import uk.gov.dstl.baleen.resources.data.Multiplicity; import uk.gov.dstl.baleen.uima.BaleenResource; /** * Resource for gender multiplicities * <p> * Due to the nature of the data the gender should be of reasonably high quality, but the * multiplicity is poor. This is because there is no singular mention of counts in the data, so its * impossible to understand how many times relatively a word is used a singular vs plural. * <p> * Patterns with numbers are ignored. * <p> * The implementation stores three maps (per gender / multiplicity) one for exact matches, one for * starts with and one for endsWith. To improve performance endsWith contains the reversed text so * it can be matched backwards. * * This data was originally from http://www.clsp.jhu.edu/~sbergsma/Gender/ and the data is licenced * under the Creative Commons Unported License. See the associated paper: Shane Bergsma and Dekang * Lin, Bootstrapping Path-Based Pronoun Resolution, In Proceedings of the Conference on * Computational Lingustics / Association for Computational Linguistics (COLING/ACL-06), Sydney, * Australia, July 17-21, 2006. * */ public class SharedGenderMultiplicityResource extends BaleenResource { // TODO: I'm not sure if the exact match is correct - should we look for an exact match or // should this be put into both start and end? // Perhaps the whole thing would be better as contains but that would be very slow. Better to // return UNKNOWN than guess private static final Splitter LINE_SPLITTER = Splitter.on(" "); private static final Splitter WORD_SPLITTER = Splitter.on(" ").trimResults().omitEmptyStrings(); // TODO: These values are just guesses private static final int GENDER_SAMPLE_THRESHOLD = 20; private static final int PLURAL_THRESHOLD = 200; private final Map<String, Multiplicity> exactMultiplicity = new HashMap<>(); private final Map<String, Multiplicity> endsWithMultiplicity = new HashMap<>(); private final Map<String, Multiplicity> startsWithMultiplicity = new HashMap<>(); private final Map<String, Gender> exactGender = new HashMap<>(); private final Map<String, Gender> endsWithGender = new HashMap<>(); private final Map<String, Gender> startsWithGender = new HashMap<>(); @Override protected boolean doInitialize(ResourceSpecifier specifier, Map<String, Object> additionalParams) throws ResourceInitializationException { Arrays.asList("gender.aa.gz", "gender.ab.gz", "gender.ac.gz", "gender.ad.gz", "gender.ae.gz", "gender.af.gz") .stream() .flatMap(f -> { try (BufferedReader reader = new BufferedReader( new InputStreamReader(new GZIPInputStream(getClass().getResourceAsStream("gender/"+f)), StandardCharsets.UTF_8))) { // Crazy, but if we return then the inputstream gets closed so the lines() // stream fails. return reader.lines() .collect(Collectors.toList()) .stream(); } catch (final Exception e) { getMonitor().warn("Unable to load from gender file", e); return Stream.empty(); } }) .filter(s -> s.contains("\t")) // TODO; Currently ignore any of the numerical stuff its too tedious to work with .filter(s -> !s.contains("#")) .forEach(s -> loadFromGenderRow(s)); return super.doInitialize(specifier, additionalParams); } private void loadFromGenderRow(String s) { try { final String[] line = s.split("\t", 2); final String np = line[0].trim().toLowerCase(); final Iterable<String> counts = LINE_SPLITTER.split(line[1]); final Iterator<String> iterator = counts.iterator(); final int m = Integer.parseInt(iterator.next()); final int f = Integer.parseInt(iterator.next()); final int n = Integer.parseInt(iterator.next()); final int p = Integer.parseInt(iterator.next()); final int genderTotal = m + f + n; if (genderTotal > GENDER_SAMPLE_THRESHOLD) { if (m > 2 * Math.max(f, n)) { saveGender(np, Gender.M); } else if (f > 2 * Math.max(m, n)) { saveGender(np, Gender.F); } else if (n > 2 * Math.max(m, f)) { saveGender(np, Gender.N); } } if (p > PLURAL_THRESHOLD) { // TODO: Since we don't have a singular count I guess we just have a // threshold here? I can't see how you compare to the m/f/n words saveMultiplicity(np, Multiplicity.PLURAL); } else if (genderTotal > GENDER_SAMPLE_THRESHOLD) { // If we've seen it a lot otherwise we assume it must be singular saveMultiplicity(np, Multiplicity.SINGULAR); } } catch (final Exception e) { getMonitor().warn("Unable to parse line {}", s, e); } } private void saveMultiplicity(String np, Multiplicity multiplicity) { final String key = np.replaceAll("!", "").trim(); if (np.startsWith("!")) { endsWithMultiplicity.put(reverse(key), multiplicity); } else if (np.endsWith("!")) { startsWithMultiplicity.put(key, multiplicity); } else { exactMultiplicity.put(key, multiplicity); } } private void saveGender(String np, Gender gender) { final String key = np.replaceAll("!", "").trim(); if (np.startsWith("!")) { endsWithGender.put(reverse(key), gender); } else if (np.endsWith("!")) { startsWithGender.put(key, gender); } else { exactGender.put(key, gender); } } /** * Return the gender for a given string, if it is known, or UNKNOWN otherwise */ public Gender lookupGender(String text) { return lookup(exactGender, startsWithGender, endsWithGender, text, Gender.UNKNOWN); } /** * Return the multiplicity for a given string, if it is known, or UNKNOWN otherwise */ public Multiplicity lookupMultiplicity(String text) { return lookup(exactMultiplicity, startsWithMultiplicity, endsWithMultiplicity, text, Multiplicity.UNKNOWN); } private <T> T lookup(Map<String, T> exact, Map<String, T> startsWith, Map<String, T> endsWith, String inputText, T defaultValue) { String text = inputText.toLowerCase(); // Try an exact match T t = exact.get(text); if (t != null) { return t; } final List<String> words = WORD_SPLITTER.splitToList(text); // Try start t = lookup(startsWith, words); if (t != null) { return t; } // Try endWith final List<String> reversed = new ArrayList<>(words.size()); for (int i = words.size() - 1; i > 0; i--) { final String word = reverse(words.get(i)); reversed.add(word); } t = lookup(endsWith, reversed); if (t != null) { return t; } return defaultValue; } private <T> T lookup(Map<String, T> map, List<String> words) { for (int i = words.size() - 1; i >= 0; i--) { final String s = words.stream().skip(i).collect(Collectors.joining(" ")); final T t = map.get(s); if (t != null) { return t; } } return null; } private String reverse(String s) { return new StringBuilder(s).reverse().toString(); } }