package edu.stanford.nlp.international.arabic.pipeline; import edu.stanford.nlp.util.logging.Redwood; import java.io.BufferedWriter; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.OutputStreamWriter; import java.io.PrintWriter; import java.io.UnsupportedEncodingException; import edu.stanford.nlp.trees.treebank.ConfigParser; import edu.stanford.nlp.trees.LabeledScoredTreeFactory; import edu.stanford.nlp.trees.Tree; import edu.stanford.nlp.trees.international.arabic.ATBTreeUtils; /** * Converts ATB gold parse trees to a format appropriate for training a POS tagger (especially * the Stanford POS tagger!). * * @author Spence Green * */ public class TaggedArabicDataset extends ATBArabicDataset { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(TaggedArabicDataset.class); private String wordTagDelim = "_"; @Override public void build() { //Set specific options for this dataset if(options.containsKey(ConfigParser.paramTagDelim)) { wordTagDelim = options.getProperty(ConfigParser.paramTagDelim); } for(File path : pathsToData) { int prevSize = treebank.size(); if(splitFilter == null) { treebank.loadPath(path,treeFileExtension,false); } else { treebank.loadPath(path,splitFilter); } toStringBuffer.append(String.format(" Loaded %d trees from %s\n", treebank.size() - prevSize, path.getPath())); prevSize = treebank.size(); } PrintWriter outfile = null; PrintWriter flatFile = null; try { outfile = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outFileName),"UTF-8"))); flatFile = (makeFlatFile) ? new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(flatFileName),"UTF-8"))) : null; ArabicTreeTaggedNormalizer tv = new ArabicTreeTaggedNormalizer(outfile,flatFile); treebank.apply(tv); outputFileList.add(outFileName); if(makeFlatFile) { outputFileList.add(flatFileName); } } catch (UnsupportedEncodingException e) { System.err.printf("%s: Filesystem does not support UTF-8 output%n", this.getClass().getName()); e.printStackTrace(); } catch (FileNotFoundException e) { System.err.printf("%s: Could not open %s for writing%n", this.getClass().getName(), outFileName); } finally { if(outfile != null) { outfile.close(); } if(flatFile != null) { flatFile.close(); } } } protected class ArabicTreeTaggedNormalizer extends ArabicRawTreeNormalizer { public ArabicTreeTaggedNormalizer(PrintWriter outFile, PrintWriter flatFile) { super(outFile,flatFile); } public void visitTree(Tree t) { if(t == null || t.value().equals("X")) return; t = t.prune(nullFilter, new LabeledScoredTreeFactory()); for(Tree node : t) { if(node.isPreTerminal()) { processPreterminal(node); } } outfile.println(ATBTreeUtils.taggedStringFromTree(t, removeEscapeTokens, wordTagDelim)); if(flatFile != null) { flatFile.println(ATBTreeUtils.flattenTree(t)); } } } }