/** * Phuong LE HONG, phuonglh@gmail.com */ package vn.hus.nlp.tagger.util; import java.io.File; import java.io.FileNotFoundException; import java.io.PrintWriter; import java.util.ArrayList; import java.util.List; import vn.hus.nlp.tagger.IConstants; import edu.stanford.nlp.ling.CategoryWordTag; import edu.stanford.nlp.trees.DiskTreebank; import edu.stanford.nlp.trees.PennTreeReaderFactory; import edu.stanford.nlp.trees.Tree; import edu.stanford.nlp.trees.TreeVisitor; import edu.stanford.nlp.trees.Treebank; import edu.stanford.nlp.trees.Trees; /** * @author LE HONG Phuong, phuonglh@gmail.com * <p> * Oct 7, 2009, 4:44:01 PM * <p> * This utility is used for converting parse sentences to tagged sentences. */ public class TreeToTaggedSentence { private List<String> taggedSentences = new ArrayList<String>(); /** * Collects tagged sentences from a treebank. * @param treebankFilename a treebank */ public void collectSentences(String treebankFilename) { // create the treebank object // using the Vietnamese tree reader // Treebank treebank = new DiskTreebank(new VietnameseTreeReaderFactory()); // use the Penn tree reader for collecting punctuations since // the Vietnamese tree reader strips them out. Treebank treebank = new DiskTreebank(new PennTreeReaderFactory()); CategoryWordTag.suppressTerminalDetails = true; // load the treebank treebank.loadPath(treebankFilename); // create a height collector TagCollector categoryCollector = this.new TagCollector(); // collect the categories treebank.apply(categoryCollector); } /** * Prints the tagged sentences to a writer. * @param pw a print writer. */ public void printTaggedSentences(PrintWriter pw) { for (String s : taggedSentences) { pw.append(s); pw.append("\n"); } } /** * @param args */ public static void main(String[] args) { TreeToTaggedSentence ctd = new TreeToTaggedSentence(); System.out.println("Collecting tagged sentences..."); ctd.collectSentences(IConstants.TREEBANK); PrintWriter pw; try { pw = new PrintWriter(new File(IConstants.CORPUS)); ctd.printTaggedSentences(pw); pw.close(); } catch (FileNotFoundException e) { e.printStackTrace(); } System.out.println("Done."); } /** * @author LE HONG Phuong, phuonglh@gmail.com * <p> * Oct 7, 2009, 4:48:53 PM * <p> * Tagged sentences collector. */ class TagCollector implements TreeVisitor { /* (non-Javadoc) * @see edu.stanford.nlp.trees.TreeVisitor#visitTree(edu.stanford.nlp.trees.Tree) */ public void visitTree(Tree t) { // get a list of tags of t List<Tree> tags = Trees.preTerminals(t); StringBuffer buffer = new StringBuffer(512); for (Tree node : tags) { String tag = node.label().toString(); if (tag.equals("-NONE-")) { tag = ""; } tag = basicCategory(tag); if (tag.length() > 0) { String word = node.children()[0].label().toString(); // convert slash to back slash if (tag.equals(IConstants.DEFAULT_MODEL_FILE)) { tag = "SLASH"; word = "SLASH"; } buffer.append(word); buffer.append(IConstants.DELIM); buffer.append(tag); buffer.append(" "); } } taggedSentences.add(buffer.toString().trim()); } private String basicCategory(String string) { int index = string.indexOf('-'); if (index > 0) { return string.substring(0, index); } return string; } } }