package edu.stanford.nlp.dcoref.util;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Map;

import edu.stanford.nlp.dcoref.Dictionaries.Gender;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.util.Generics;

/**
 * This tool converts the gender file from the following:
 * <br>
 * w1 w2... TAB male female neutral <br>
 * etc <br>
 * <br>
 * into a serialized data structure which should take much less time to load.
 * <br>
 * Each input line holds a space-separated phrase, a TAB, and then three
 * space-separated integer counts (male, female, neutral).  A phrase is kept
 * only when one of the counts is greater than 2 and strictly more than twice
 * the sum of the other two; all other phrases are dropped.
 * <br>
 * Usage: <code>ConvertGenderFile -input &lt;gender file&gt; -output &lt;serialized file&gt;</code>
 *
 * @author John Bauer
 */
public class ConvertGenderFile {

  private ConvertGenderFile() {} // static class

  /**
   * Reads the gender count file named by {@code -input}, builds a map from
   * token lists to {@link Gender}, and hands that map to
   * {@link IOUtils#writeObjectToFile} together with the {@code -output} path.
   *
   * @param args command line flags; {@code -input} and {@code -output} are
   *             both required, flag names are matched case-insensitively
   * @throws IOException if reading the input or writing the output fails
   * @throws IllegalArgumentException if a flag is unknown, a flag is missing
   *             its value, a required flag is absent, or an input line is
   *             malformed
   */
  public static void main(String[] args) throws IOException {
    String input = null;
    String output = null;

    for (int argIndex = 0; argIndex < args.length; ) {
      if (args[argIndex].equalsIgnoreCase("-input")) {
        input = requireValue(args, argIndex);
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-output")) {
        output = requireValue(args, argIndex);
        argIndex += 2;
      } else {
        throw new IllegalArgumentException("Unknown argument " + args[argIndex]);
      }
    }

    if (input == null) {
      throw new IllegalArgumentException("Must specify input with -input");
    }
    if (output == null) {
      throw new IllegalArgumentException("Must specify output with -output");
    }

    Map<List<String>, Gender> genderNumber = Generics.newHashMap();

    BufferedReader reader = IOUtils.readerFromString(input);
    try {
      for (String line; (line = reader.readLine()) != null; ) {
        String[] split = line.split("\t");
        if (split.length < 2) {
          throw new IllegalArgumentException("Malformed line (expected 'words TAB male female neutral'): " + line);
        }
        String[] countStr = split[1].split(" ");
        if (countStr.length < 3) {
          throw new IllegalArgumentException("Malformed line (expected three counts after the TAB): " + line);
        }

        int male = Integer.parseInt(countStr[0]);
        int female = Integer.parseInt(countStr[1]);
        int neutral = Integer.parseInt(countStr[2]);

        Gender gender = chooseGender(male, female, neutral);
        if (gender == Gender.UNKNOWN) {
          // phrases without a clearly dominant gender are not stored at all
          continue;
        }

        String[] words = split[0].split(" ");
        List<String> tokens = Arrays.asList(words);
        genderNumber.put(tokens, gender);
      }
    } finally {
      // release the reader even if a line fails to parse
      reader.close();
    }

    IOUtils.writeObjectToFile(genderNumber, output);
  }

  /**
   * Returns the value following the flag at {@code flagIndex}, failing with a
   * descriptive error instead of an index-out-of-bounds when it is missing.
   */
  private static String requireValue(String[] args, int flagIndex) {
    if (flagIndex + 1 >= args.length) {
      throw new IllegalArgumentException("Missing value for argument " + args[flagIndex]);
    }
    return args[flagIndex + 1];
  }

  /**
   * Picks the dominant gender for one set of counts.  A category wins when
   * half of its count exceeds the sum of the other two counts and the count
   * itself is greater than 2; categories are tried in the order male, female,
   * neutral.  Returns {@link Gender#UNKNOWN} when no category qualifies.
   */
  private static Gender chooseGender(int male, int female, int neutral) {
    if (male * 0.5 > female + neutral && male > 2) {
      return Gender.MALE;
    } else if (female * 0.5 > male + neutral && female > 2) {
      return Gender.FEMALE;
    } else if (neutral * 0.5 > male + female && neutral > 2) {
      return Gender.NEUTRAL;
    }
    return Gender.UNKNOWN;
  }
}