// MixDisambiguation.java example
//
// Explorer
// CoreNLP-master
package edu.stanford.nlp.truecaser;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.Locale;
import java.util.Map;

import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.util.Generics;

/**
 * This utility takes the tokens in one or more data files and picks the
 * most common casing of each word.  It then prints, for each lowercased
 * word, the cased form that was seen most often.
 * <p>
 * Command-line arguments: every argument is treated as the name of a file
 * to read, except {@code -noLowercase} (matched case-insensitively), which
 * suppresses output of words whose most common form is already identical to
 * their lowercased form.
 * <p>
 * Output is one line per word on standard out: the lowercased word, a tab,
 * and the most common cased form.
 *
 * @author Michel Galley
 */
public class MixDisambiguation {

  /** Maps each lowercased token to the counts of the cased forms seen for it. */
  private static final Map<String, Counter<String>> map = Generics.newHashMap();

  /** Maps each lowercased token to its most frequently seen cased form. */
  private static final Map<String, String> highest = Generics.newHashMap();

  private MixDisambiguation() {} // static class

  /**
   * Counts the casings of all tokens in the given files and prints the most
   * common casing of each word to standard out.
   *
   * @param args file names to read, optionally mixed with {@code -noLowercase}
   * @throws IOException if one of the files cannot be opened or read
   */
  public static void main(String[] args) throws IOException {
    boolean outputLowercase = true;
    for (String arg : args) {
      if (arg.equalsIgnoreCase("-noLowercase")) {
        // The flag only affects the output phase below, so its position
        // relative to the file names does not matter.
        outputLowercase = false;
        continue;
      }

      // everything else is considered a filename
      countTokens(arg);
    }

    for (Map.Entry<String, Counter<String>> entry : map.entrySet()) {
      highest.put(entry.getKey(), mostFrequent(entry.getValue()));
    }

    for (Map.Entry<String, String> entry : highest.entrySet()) {
      String k = entry.getKey();
      String cased = entry.getValue();
      if (!outputLowercase && k.equals(cased)) {
        // -noLowercase: skip words whose best casing is plain lowercase
        continue;
      }
      System.out.printf("%s\t%s\n", k, cased);
    }
  }

  /**
   * Reads a file line by line, splits each line on single spaces, and adds
   * every non-empty token to the per-word casing counts in {@link #map}.
   *
   * @param filename path of the file to read
   * @throws IOException if the file cannot be opened or read
   */
  private static void countTokens(String filename) throws IOException {
    // try-with-resources guarantees the reader is closed, even on error
    try (BufferedReader in = new BufferedReader(new FileReader(filename))) {
      for (String line; (line = in.readLine()) != null; ) {
        for (String tok : line.split(" ")) {
          if (tok.isEmpty()) {
            // split(" ") yields "" for blank lines and for leading or
            // repeated spaces; those are not words, so do not count them
            continue;
          }
          // Locale.ROOT keeps the grouping key independent of the JVM's
          // default locale (e.g. Turkish dotted/dotless i)
          String lctok = tok.toLowerCase(Locale.ROOT);
          Counter<String> counter = map.get(lctok);
          if (counter == null) {
            counter = new ClassicCounter<>();
            map.put(lctok, counter);
          }
          counter.incrementCount(tok);
        }
      }
    }
  }

  /**
   * Returns the key with the highest count in the given counter.  Counts are
   * compared after truncation to {@code int}.  On a tie, the key that comes
   * first in the iteration order of {@code counter.keySet()} wins, because
   * the comparison is strict.
   *
   * @param counter counts of the cased forms of one word
   * @return the most frequent cased form, or the empty string if the counter
   *         has no keys
   */
  private static String mostFrequent(Counter<String> counter) {
    String maxstr = "";
    int maxcount = -1;
    for (String str : counter.keySet()) {
      int count = (int) counter.getCount(str);
      if (count > maxcount) {
        maxstr = str;
        maxcount = count;
      }
    }
    return maxstr;
  }
}