package edu.stanford.nlp.truecaser; import java.util.Map; import java.io.BufferedReader; import java.io.IOException; import java.io.FileReader; import edu.stanford.nlp.stats.ClassicCounter; import edu.stanford.nlp.stats.Counter; import edu.stanford.nlp.util.Generics; /** * This utility takes the tokens in a data file and picks the most * common casing of words. It then outputs the most common case for * each word. * * @author Michel Galley */ public class MixDisambiguation { private static Map<String, Counter<String>> map = Generics.newHashMap(); private static Map<String, String> highest = Generics.newHashMap(); private MixDisambiguation() {} // static class public static void main(String[] args) throws IOException { boolean outputLowercase = true; for (String arg : args) { if (arg.equalsIgnoreCase("-noLowercase")) { outputLowercase = false; continue; } // everything else is considered a filename BufferedReader in = new BufferedReader(new FileReader(arg)); for (String line; (line = in.readLine()) != null; ) { String[] toks = line.split(" "); for (String tok : toks) { String lctok = tok.toLowerCase(); Counter<String> counter = map.get(lctok); if (counter == null) { counter = new ClassicCounter<>(); map.put(lctok, counter); } counter.incrementCount(tok); } } } for (String k : map.keySet()) { Counter<String> counter = map.get(k); String maxstr = ""; int maxcount = -1; for(String str : counter.keySet()) { int count = (int)counter.getCount(str); if (count > maxcount) { maxstr = str; maxcount = count; } } highest.put(k, maxstr); } for (String k : highest.keySet()) { String cased = highest.get(k); if (!outputLowercase && k.equals(cased)) { continue; } System.out.printf("%s\t%s\n", k, cased); } } }