package edu.stanford.nlp.trees.international.pennchinese; import edu.stanford.nlp.util.logging.Redwood; import edu.stanford.nlp.io.NumberRangesFileFilter; import edu.stanford.nlp.io.IOUtils; import edu.stanford.nlp.ling.SentenceUtils; import edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams; import edu.stanford.nlp.parser.lexparser.LexicalizedParser; import edu.stanford.nlp.parser.lexparser.Options; import edu.stanford.nlp.stats.EquivalenceClassEval; import edu.stanford.nlp.trees.*; import java.io.*; import java.util.ArrayList; import java.util.Collections; import java.util.List; /** * A transformer to extend tags down to the level of individual characters. * Each word preterminal is split into new preterminals for each character * with tags corresponding to the original preterminal tag plus a suffix * depending on the position of the character in the word: _S for single-char * words, _B for first char of multi-char words, _M for middle chars and _E * for final chars. * <p/> * This is used in combining Chinese parsing and word segmentation using the * method of Luo '03. * <p/> * Note: it implements TreeTransformer because we might want to do away * with TreeNormalizers in favor of TreeTransformers * * @author Galen Andrew (galand@cs.stanford.edu) Date: May 13, 2004 */ public class CharacterLevelTagExtender extends BobChrisTreeNormalizer implements TreeTransformer { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(CharacterLevelTagExtender.class); private static final long serialVersionUID = 7893996593626523700L; private static final boolean useTwoCharTags = false; public CharacterLevelTagExtender() { super(new ChineseTreebankLanguagePack()); } public CharacterLevelTagExtender(TreebankLanguagePack tlp) { super(tlp); } @Override public Tree normalizeWholeTree(Tree tree, TreeFactory tf) { return transformTree(super.normalizeWholeTree(tree,tf)); } // static Set preterminals = new HashSet(); public Tree transformTree(Tree tree) { TreeFactory tf = tree.treeFactory(); String tag = tree.label().value(); if (tree.isPreTerminal()) { String word = tree.firstChild().label().value(); List<Tree> newPreterms = new ArrayList<>(); for (int i = 0, size = word.length(); i < size; i++) { String singleCharLabel = new String(new char[]{word.charAt(i)}); Tree newLeaf = tf.newLeaf(singleCharLabel); String suffix; if (useTwoCharTags) { if (word.length() == 1 || i == 0) { suffix = "_S"; } else { suffix = "_M"; } } else { if (word.length() == 1) { suffix = "_S"; } else if (i == 0) { suffix = "_B"; } else if (i == word.length() - 1) { suffix = "_E"; } else { suffix = "_M"; } } newPreterms.add(tf.newTreeNode(tag + suffix, Collections.<Tree>singletonList(newLeaf))); } return tf.newTreeNode(tag, newPreterms); } else { List<Tree> newChildren = new ArrayList<>(); for (int i = 0; i < tree.children().length; i++) { Tree child = tree.children()[i]; newChildren.add(transformTree(child)); } return tf.newTreeNode(tag, newChildren); } } public Tree untransformTree(Tree tree) { TreeFactory tf = tree.treeFactory(); if (tree.isPrePreTerminal()) { if (tree.firstChild().label().value().matches(".*_.")) { StringBuilder word = new StringBuilder(); for (int i = 0; i < tree.children().length; i++) { Tree child = tree.children()[i]; word.append(child.firstChild().label().value()); } Tree newChild = tf.newLeaf(word.toString()); tree.setChildren(Collections.singletonList(newChild)); } } else { for (int i = 0; i < tree.children().length; i++) { Tree child = tree.children()[i]; untransformTree(child); } } return tree; } private static void testTransAndUntrans(CharacterLevelTagExtender e, Treebank tb, PrintWriter pw) { for (Tree tree : tb) { Tree oldTree = tree.treeSkeletonCopy(); e.transformTree(tree); e.untransformTree(tree); if (!tree.equals(oldTree)) { pw.println("NOT EQUAL AFTER UNTRANSFORMATION!!!"); pw.println(); oldTree.pennPrint(pw); pw.println(); tree.pennPrint(pw); pw.println("------------------"); } } } /** * for testing -- CURRENTLY BROKEN!!! * * @param args input dir and output filename * @throws IOException */ public static void main(String[] args) throws IOException { if (args.length != 3) { throw new RuntimeException("args: treebankPath trainNums testNums"); } ChineseTreebankParserParams ctpp = new ChineseTreebankParserParams(); ctpp.charTags = true; // TODO: these options are getting clobbered by reading in the // parser object (unless it's a text file parser?) Options op = new Options(ctpp); op.doDep = false; op.testOptions.maxLength = 90; LexicalizedParser lp; try { FileFilter trainFilt = new NumberRangesFileFilter(args[1], false); lp = LexicalizedParser.trainFromTreebank(args[0], trainFilt, op); try { String filename = "chineseCharTagPCFG.ser.gz"; log.info("Writing parser in serialized format to file " + filename + " "); System.err.flush(); ObjectOutputStream out = IOUtils.writeStreamFromString(filename); out.writeObject(lp); out.close(); log.info("done."); } catch (IOException ioe) { ioe.printStackTrace(); } } catch (IllegalArgumentException e) { lp = LexicalizedParser.loadModel(args[1], op); } FileFilter testFilt = new NumberRangesFileFilter(args[2], false); MemoryTreebank testTreebank = ctpp.memoryTreebank(); testTreebank.loadPath(new File(args[0]), testFilt); PrintWriter pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream("out.chi"), "GB18030"), true); WordCatEquivalenceClasser eqclass = new WordCatEquivalenceClasser(); WordCatEqualityChecker eqcheck = new WordCatEqualityChecker(); EquivalenceClassEval eval = new EquivalenceClassEval(eqclass, eqcheck); // System.out.println("Preterminals:" + preterminals); System.out.println("Testing..."); for (Tree gold : testTreebank) { Tree tree; try { tree = lp.parseTree(gold.yieldHasWord()); if (tree == null) { System.out.println("Failed to parse " + gold.yieldHasWord()); continue; } } catch (Exception e) { e.printStackTrace(); continue; } gold = gold.firstChild(); pw.println(SentenceUtils.listToString(gold.preTerminalYield())); pw.println(SentenceUtils.listToString(gold.yield())); gold.pennPrint(pw); pw.println(tree.preTerminalYield()); pw.println(tree.yield()); tree.pennPrint(pw); // Collection allBrackets = WordCatConstituent.allBrackets(tree); // Collection goldBrackets = WordCatConstituent.allBrackets(gold); // eval.eval(allBrackets, goldBrackets); eval.displayLast(); } System.out.println(); System.out.println(); eval.display(); } }