package vn.hus.nlp.tagger.util; import java.util.HashSet; import java.util.Map; import java.util.Set; import java.util.TreeMap; import vn.hus.nlp.utils.UTF8FileUtility; import edu.stanford.nlp.stats.Counter; import edu.stanford.nlp.stats.Counters; import edu.stanford.nlp.stats.IntCounter; /** * @author LE HONG Phuong, phuonglh@gmail.com * <p> * Jan 25, 2010, 1:52:13 PM * <p> * This utility counts the distribution of tags given in the VCL dictionary. */ public class TagDistribution { private Map<String, Set<String>> map = new TreeMap<String, Set<String>>(); private final String dictionary = "data/vcl.csv"; public TagDistribution() { String[] lines = UTF8FileUtility.getLines(dictionary); int position = -1; String previousWord = null; String word = null; String tag = null; for (String line : lines) { position = line.indexOf(';'); if (position > 0) { if (word != null && !word.equals("NULL")) { previousWord = word; } word = line.substring(0, position).trim(); tag = line.substring(position+1).trim(); if (word.equals("NULL")) { // update an entry of the map map.get(previousWord).add(tag); } else { // make a new entry for the map Set<String> tags = new HashSet<String>(); tags.add(tag); map.put(word, tags); } } } } public void show() { // print out ten random entries of the map for (int i = 0; i < 10; i++) { int randIndex = (int) (Math.random() * map.size()); String word = map.keySet().toArray(new String[map.keySet().size()])[randIndex]; System.out.print(word); System.out.println(map.get(word)); } } public void statistics() { System.out.println("Number of distinct entries = " + map.keySet().size()); /* for (String word : map.keySet()) { System.out.println(word + ";" + map.get(word)); } */ // get the counter of entry and its different pos count Counter<String> counter = new IntCounter<String>(); int[] counts = new int[6]; for (String e : map.keySet()) { int s = map.get(e).size(); counts[s]++; counter.incrementCount(e, s); } System.out.println("Max POS counts = " + Counters.max(counter) + " at " + Counters.argmax(counter)); System.out.println("Mean POS counts = " + Counters.mean(counter)); for (int j = 1; j < counts.length; j++) { System.out.println(j + " : " + counts[j]); } } /** * @param args */ public static void main(String[] args) { TagDistribution td = new TagDistribution(); td.statistics(); td.show(); } }