package edu.berkeley.nlp.PCFGLA; import edu.berkeley.nlp.PCFGLA.GrammarTrainer.Options; import edu.berkeley.nlp.PCFGLA.smoothing.SmoothAcrossParentBits; import edu.berkeley.nlp.PCFGLA.smoothing.SmoothAcrossParentSubstate; import edu.berkeley.nlp.PCFGLA.smoothing.Smoother; import edu.berkeley.nlp.io.PTBLineLexer; import edu.berkeley.nlp.io.PTBTokenizer; import edu.berkeley.nlp.io.PTBLexer; import edu.berkeley.nlp.syntax.StateSet; import edu.berkeley.nlp.syntax.Tree; import edu.berkeley.nlp.syntax.Trees; import edu.berkeley.nlp.ui.TreeJPanel; import edu.berkeley.nlp.util.CommandLineUtils; import edu.berkeley.nlp.util.Numberer; import java.awt.AlphaComposite; import java.awt.Graphics2D; import java.awt.geom.Rectangle2D; import java.awt.image.BufferedImage; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.PrintWriter; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import javax.imageio.ImageIO; import javax.swing.JFrame; /** * Reads in the Penn Treebank and generates N_GRAMMARS different grammars. * * @author Slav Petrov */ public class BerkeleyParser { static TreeJPanel tjp; static JFrame frame; public static class Options { @Option(name = "-gr", required = true, usage = "Grammarfile (Required)\n") public String grFileName; @Option(name = "-tokenize", usage = "Tokenize input first. (Default: false=text is already tokenized)") public boolean tokenize = false; @Option(name = "-viterbi", usage = "Compute viterbi derivation instead of max-rule tree (Default: max-rule)") public boolean viterbi; @Option(name = "-binarize", usage = "Output binarized trees. (Default: false)") public boolean binarize; @Option(name = "-scores", usage = "Output inside scores (only for binarized viterbi trees). (Default: false)") public boolean scores; @Option(name = "-substates", usage = "Output subcategories (only for binarized viterbi trees). (Default: false)") public boolean substates; @Option(name = "-accurate", usage = "Set thresholds for accuracy. (Default: set thresholds for efficiency)") public boolean accurate; @Option(name = "-modelScore", usage = "Output effective model score (max rule score for max rule parser) (Default: false)") public boolean modelScore; @Option(name = "-confidence", usage = "Output confidence measure, i.e. likelihood of tree given words: P(T|w) (Default: false)") public boolean confidence; @Option(name = "-sentence_likelihood", usage = "Output sentence likelihood, i.e. summing out all parse trees: P(w) (Default: false)") public boolean sentence_likelihood; @Option(name = "-tree_likelihood", usage = "Output joint likelihood of tree and words: P(t,w) (Default: false)") public boolean tree_likelihood; @Option(name = "-variational", usage = "Use variational rule score approximation instead of max-rule (Default: false)") public boolean variational; @Option(name = "-render", usage = "Write rendered tree to image file. (Default: false)") public boolean render; @Option(name = "-chinese", usage = "Enable some Chinese specific features in the lexicon.") public boolean chinese; @Option(name = "-inputFile", usage = "Read input from this file instead of reading it from STDIN.") public String inputFile; @Option(name = "-maxLength", usage = "Maximum sentence length (Default = 200).") public int maxLength = 200; @Option(name = "-nThreads", usage = "Parse in parallel using n threads (Default: 1).") public int nThreads = 1; @Option(name = "-kbest", usage = "Output the k best parse max-rule trees (Default: 1).") public int kbest = 1; @Option(name = "-outputFile", usage = "Store output in this file instead of printing it to STDOUT.") public String outputFile; @Option(name = "-useGoldPOS", usage = "Read data in CoNLL format, including gold part of speech tags.") public boolean goldPOS; @Option(name = "-dumpPosteriors", usage = "Dump max-rule posteriors to disk.") public boolean dumpPosteriors; @Option(name = "-ec_format", usage = "Use Eugene Charniak's input and output format.") public boolean ec_format; } @SuppressWarnings("unchecked") public static void main(String[] args) { OptionParser optParser = new OptionParser(Options.class); Options opts = (Options) optParser.parse(args, true); double threshold = 1.0; String inFileName = opts.grFileName; ParserData pData = ParserData.Load(inFileName); if (pData==null) { System.out.println("Failed to load grammar from file"+inFileName+"."); System.exit(1); } Grammar grammar = pData.getGrammar(); Lexicon lexicon = pData.getLexicon(); Numberer.setNumberers(pData.getNumbs()); if (opts.chinese) Corpus.myTreebank = Corpus.TreeBankType.CHINESE; CoarseToFineMaxRuleParser parser = null; if (opts.kbest==1) parser = new CoarseToFineMaxRuleParser(grammar, lexicon, threshold,-1,opts.viterbi,opts.substates,opts.scores, opts.accurate, opts.variational, true, true); else parser = new CoarseToFineNBestParser(grammar, lexicon, opts.kbest,threshold,-1,opts.viterbi,opts.substates,opts.scores, opts.accurate, opts.variational, false, true); parser.binarization = pData.getBinarization(); if (opts.render) tjp = new TreeJPanel(); MultiThreadedParserWrapper m_parser = null; if (opts.nThreads > 1){ System.err.println("Parsing with "+opts.nThreads+" threads in parallel."); m_parser = new MultiThreadedParserWrapper(parser, opts.nThreads); } try{ BufferedReader inputData = (opts.inputFile==null) ? new BufferedReader(new InputStreamReader(System.in)) : new BufferedReader(new InputStreamReader(new FileInputStream(opts.inputFile), "UTF-8")); PrintWriter outputData = (opts.outputFile==null) ? new PrintWriter(new OutputStreamWriter(System.out)) : new PrintWriter(new OutputStreamWriter(new FileOutputStream(opts.outputFile), "UTF-8"), true); PTBLineLexer tokenizer = null; if (opts.tokenize) tokenizer = new PTBLineLexer(); String line = ""; String sentenceID = ""; while((line=inputData.readLine()) != null){ List<String> sentence = null; List<String> posTags = null; if (opts.goldPOS){ sentence = new ArrayList<String>(); posTags = new ArrayList<String>(); List<String> tmp = Arrays.asList(line.split("\t")); if (tmp.size()==0) continue; // System.out.println(line+tmp); sentence.add(tmp.get(0)); String[] tags = tmp.get(1).split("-"); posTags.add(tags[0]); while(!(line=inputData.readLine()).equals("")){ tmp = Arrays.asList(line.split("\t")); if (tmp.size()==0) break; // System.out.println(line+tmp); sentence.add(tmp.get(0)); tags = tmp.get(1).split("-"); posTags.add(tags[0]); } } else { if (opts.ec_format){ int breakIndex = line.indexOf(">"); sentenceID = line.substring(3,breakIndex-1); line = line.substring(breakIndex+2, line.length()-5); } if (!opts.tokenize) sentence = Arrays.asList(line.split("\\s+")); else { sentence = tokenizer.tokenizeLine(line); } } // if (sentence.size()==0) { outputData.write("\n"); continue; }//break; if (sentence.size()>opts.maxLength) { outputData.write("(())\n"); if (opts.kbest > 1){ outputData.write("\n"); } System.err.println("Skipping sentence with "+sentence.size()+" words since it is too long."); continue; } if (opts.nThreads > 1){ m_parser.parseThisSentence(sentence); while (m_parser.hasNext()){ List<Tree<String>> parsedTrees = m_parser.getNext(); outputTrees(parsedTrees, outputData, parser, opts, "", sentenceID); } } else { List<Tree<String>> parsedTrees = null; if (opts.kbest > 1){ parsedTrees = parser.getKBestConstrainedParses(sentence, posTags, opts.kbest); if (parsedTrees.size()==0) { parsedTrees.add(new Tree<String>("ROOT")); } } else { parsedTrees = new ArrayList<Tree<String>>(); Tree<String> parsedTree = parser.getBestConstrainedParse(sentence,posTags,null); if (opts.goldPOS && parsedTree.getChildren().isEmpty()){ // parse error when using goldPOS, try without parsedTree = parser.getBestConstrainedParse(sentence,null,null); } parsedTrees.add(parsedTree); } outputTrees(parsedTrees, outputData, parser, opts, line, sentenceID); } } if (opts.nThreads > 1){ while(!m_parser.isDone()) { while (m_parser.hasNext()){ List<Tree<String>> parsedTrees = m_parser.getNext(); outputTrees(parsedTrees, outputData, parser, opts, line, sentenceID); } } } if (opts.dumpPosteriors){ String fileName = opts.grFileName + ".posteriors"; parser.dumpPosteriors(fileName, -1); } outputData.flush(); outputData.close(); } catch (Exception ex) { ex.printStackTrace(); } System.exit(0); } /** * @param parsedTree * @param outputData * @param opts */ private static void outputTrees(List<Tree<String>> parseTrees, PrintWriter outputData, CoarseToFineMaxRuleParser parser, edu.berkeley.nlp.PCFGLA.BerkeleyParser.Options opts, String line, String sentenceID) { String delimiter = "\t"; if (opts.ec_format){ outputData.write(parseTrees.size() +"\t" + sentenceID + "\n"); delimiter = ",\t"; } for (Tree<String> parsedTree : parseTrees){ boolean addDelimiter = false; if (opts.tree_likelihood){ double treeLL = (parsedTree.getChildren().isEmpty()) ? Double.NEGATIVE_INFINITY : parser.getLogLikelihood(parsedTree); outputData.write(treeLL+""); addDelimiter = true; } if (opts.sentence_likelihood){ double allLL = (parsedTree.getChildren().isEmpty()) ? Double.NEGATIVE_INFINITY : parser.getLogLikelihood(); if (addDelimiter) outputData.write(delimiter); addDelimiter = true; if (opts.ec_format) outputData.write("sentenceLikelihood "); outputData.write(allLL+""); } if (!opts.binarize) parsedTree = TreeAnnotations.unAnnotateTree(parsedTree); if (opts.confidence) { double treeLL = (parsedTree.getChildren().isEmpty()) ? Double.NEGATIVE_INFINITY : parser.getConfidence(parsedTree); if (addDelimiter) outputData.write(delimiter); addDelimiter = true; if (opts.ec_format) outputData.write("confidence "); outputData.write(treeLL+""); } else if (opts.modelScore) { double score = (parsedTree.getChildren().isEmpty()) ? Double.NEGATIVE_INFINITY : parser.getModelScore(parsedTree); if (addDelimiter) outputData.write(delimiter); addDelimiter = true; if (opts.ec_format) outputData.write("maxRuleScore "); outputData.write(String.format("%.8f", score)); } if (opts.ec_format) outputData.write("\n"); else if (addDelimiter) outputData.write(delimiter); if (!parsedTree.getChildren().isEmpty()) { String treeString = parsedTree.getChildren().get(0).toString(); if (opts.ec_format) outputData.write("(S1 "+treeString+" )\n"); else outputData.write("( "+treeString+" )\n"); } else { outputData.write("(())\n"); } if (opts.render) try { writeTreeToImage(parsedTree,line.replaceAll("[^a-zA-Z]", "")+".png"); } catch (IOException e) { e.printStackTrace(); } } if (opts.dumpPosteriors){ int blockSize = 50; String fileName = opts.grFileName + ".posteriors"; parser.dumpPosteriors(fileName, blockSize); } if (opts.kbest > 1) outputData.write("\n"); outputData.flush(); } public static void writeTreeToImage(Tree<String> tree, String fileName) throws IOException{ tjp.setTree(tree); BufferedImage bi =new BufferedImage(tjp.width(),tjp.height(),BufferedImage.TYPE_INT_ARGB); int t=tjp.height(); Graphics2D g2 = bi.createGraphics(); g2.setComposite(AlphaComposite.getInstance(AlphaComposite.CLEAR, 1.0f)); Rectangle2D.Double rect = new Rectangle2D.Double(0,0,tjp.width(),tjp.height()); g2.fill(rect); g2.setComposite(AlphaComposite.getInstance(AlphaComposite.SRC_OVER, 1.0f)); tjp.paintComponent(g2); //paint the graphic to the offscreen image g2.dispose(); ImageIO.write(bi,"png",new File(fileName)); //save as png format DONE! } }