/**
 * Phuong LE HONG, phuonglh@gmail.com
 */
package vn.hus.nlp.tagger.util;

import java.util.Set;
import java.util.TreeSet;

import vn.hus.nlp.tagger.IConstants;
import edu.stanford.nlp.ling.CategoryWordTag;
import edu.stanford.nlp.trees.DiskTreebank;
import edu.stanford.nlp.trees.PennTreeReaderFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeVisitor;
import edu.stanford.nlp.trees.Treebank;

/**
 * @author LE HONG Phuong, phuonglh@gmail.com
 * <p>
 * Nov 26, 2009, 5:59:23 PM
 * <p>
 * This utility collects all the words that have the Nc tag (classifiers)
 * in the treebank and prints them as a list of quoted, lower-cased strings.
 */
public class CollectionNc {

	/** The part-of-speech tag whose words are collected. */
	private static final String TARGET_TAG = "Nc";

	/** Number of words printed on each output row. */
	private static final int WORDS_PER_LINE = 12;

	/** The collected words, lower-cased; a sorted set, so output is in natural string order. */
	private final Set<String> set = new TreeSet<String>();

	/**
	 * Collects all the words which are tagged as Nc.
	 * <p>
	 * Collected words accumulate in this object; calling the method several
	 * times merges the results.
	 *
	 * @param treebankFilename the path from which the treebank is loaded
	 */
	public void collect(String treebankFilename) {
		// Use the Penn tree reader rather than the project's Vietnamese tree
		// reader: per the original author's note, the Vietnamese reader strips
		// punctuation out of the trees.
		Treebank treebank = new DiskTreebank(new PennTreeReaderFactory());
		// NOTE: this is a static flag on CategoryWordTag, so it affects every
		// user of that class in the JVM, not only this collector.
		CategoryWordTag.suppressTerminalDetails = true;
		// load the treebank
		treebank.loadPath(treebankFilename);
		// create the visitor that gathers the Nc-tagged words
		TagCollectorNc tagCollector = new TagCollectorNc();
		// visit every tree of the treebank
		treebank.apply(tagCollector);
	}

	/**
	 * Prints the collected words to standard output as quoted,
	 * comma-separated strings, {@value #WORDS_PER_LINE} per row, followed by
	 * a blank line and the total number of words.
	 */
	public void print() {
		StringBuilder buffer = new StringBuilder(1024);
		int count = 0;
		for (String w : set) {
			buffer.append('"').append(w).append("\", ");
			count++;
			if (count % WORDS_PER_LINE == 0) {
				buffer.append('\n');
			}
		}
		// Terminate the last, incomplete row. The previous implementation
		// only emitted full rows, so the trailing words were never printed.
		if (count % WORDS_PER_LINE != 0) {
			buffer.append('\n');
		}
		System.out.println(buffer.toString());
		System.out.println();
		System.out.println("There are " + set.size() + " words.");
	}

	/**
	 * Collects the Nc words of the treebank at {@link IConstants#TREEBANK}
	 * and prints them.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		CollectionNc ctd = new CollectionNc();
		System.out.println("Collecting words...");
		ctd.collect(IConstants.TREEBANK);
		ctd.print();
	}

	/**
	 * @author LE HONG Phuong, phuonglh@gmail.com
	 * <p>
	 * Nov 26, 2009, 6:02:59 PM
	 * <p>
	 * A tree visitor that adds every word tagged as Nc to the enclosing
	 * collector's set.
	 */
	class TagCollectorNc implements TreeVisitor {

		/* (non-Javadoc)
		 * @see edu.stanford.nlp.trees.TreeVisitor#visitTree(edu.stanford.nlp.trees.Tree)
		 */
		public void visitTree(Tree t) {
			for (Tree node : t) {
				if (!node.isPreTerminal()) {
					continue;
				}
				String word = node.children()[0].label().toString();
				String tag = basicCategory(node.label().toString());
				// Words containing '_' are skipped (presumably multi-syllable
				// words joined by underscores -- confirm against the treebank
				// format); the rest are stored lower-cased.
				if (tag.equals(TARGET_TAG) && word.indexOf('_') < 0) {
					set.add(word.toLowerCase());
				}
			}
		}

		/**
		 * Strips everything from the first '-' onwards from a node label.
		 *
		 * @param string a node label
		 * @return the part of the label before its first '-', or the label
		 *         unchanged if it has no '-' or starts with one
		 */
		private String basicCategory(String string) {
			int index = string.indexOf('-');
			if (index > 0) {
				return string.substring(0, index);
			}
			return string;
		}
	}
}