// CharacterLevelTagExtender.java example
//
// Explorer
// CoreNLP-master
package edu.stanford.nlp.trees.international.pennchinese; 
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.NumberRangesFileFilter;
import edu.stanford.nlp.ling.SentenceUtils;
import edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.parser.lexparser.Options;
import edu.stanford.nlp.stats.EquivalenceClassEval;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.util.logging.Redwood;

import java.io.*;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.regex.Pattern;

/**
 * A transformer to extend tags down to the level of individual characters.
 * Each word preterminal is split into new preterminals for each character
 * with tags corresponding to the original preterminal tag plus a suffix
 * depending on the position of the character in the word: _S for single-char
 * words, _B for first char of multi-char words, _M for middle chars and _E
 * for final chars.
 * <p>
 * This is used in combining Chinese parsing and word segmentation using the
 * method of Luo '03.
 * <p>
 * Note: it implements TreeTransformer because we might want to do away
 * with TreeNormalizers in favor of TreeTransformers
 *
 * @author Galen Andrew (galand@cs.stanford.edu) Date: May 13, 2004
 */
public class CharacterLevelTagExtender extends BobChrisTreeNormalizer implements TreeTransformer  {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(CharacterLevelTagExtender.class);

  private static final long serialVersionUID = 7893996593626523700L;

  /**
   * When true, only two suffixes are produced: _S for the first (or only)
   * character of a word and _M for every later character.
   */
  private static final boolean useTwoCharTags = false;

  /**
   * Matches a label that ends in an underscore followed by exactly one
   * character, which is the shape of every tag produced by
   * {@link #transformTree(Tree)}. Compiled once instead of on every call.
   */
  private static final Pattern CHAR_TAG_PATTERN = Pattern.compile(".*_.");

  /** Creates an extender backed by a {@link ChineseTreebankLanguagePack}. */
  public CharacterLevelTagExtender() {
    super(new ChineseTreebankLanguagePack());
  }

  /**
   * Creates an extender backed by the given language pack.
   *
   * @param tlp the language pack handed to the superclass constructor
   */
  public CharacterLevelTagExtender(TreebankLanguagePack tlp) {
    super(tlp);
  }

  /**
   * Runs the superclass normalization and then applies
   * {@link #transformTree(Tree)} to its result.
   *
   * @param tree the tree to normalize
   * @param tf the tree factory passed through to the superclass
   * @return the normalized tree with character-level tags
   */
  @Override
  public Tree normalizeWholeTree(Tree tree, TreeFactory tf) {
    return transformTree(super.normalizeWholeTree(tree,tf));
  }

  /**
   * Builds a new tree in which every preterminal is expanded into one
   * preterminal per character of its word. The original preterminal keeps
   * its tag and becomes the parent of the per-character preterminals, whose
   * tags are the original tag plus a position suffix. The input tree is not
   * modified; new nodes are created with the input tree's own factory.
   * <p>
   * Words are split by Unicode code point, so a character encoded as a
   * surrogate pair stays in a single leaf rather than being cut into two
   * unpaired surrogates.
   *
   * @param tree the tree (or subtree) to transform
   * @return a newly built tree with character-level preterminals
   */
  @Override
  public Tree transformTree(Tree tree) {
    TreeFactory tf = tree.treeFactory();
    String tag = tree.label().value();

    if (!tree.isPreTerminal()) {
      // Internal node (or leaf): rebuild it around its transformed children.
      Tree[] kids = tree.children();
      List<Tree> newChildren = new ArrayList<>(kids.length);
      for (Tree kid : kids) {
        newChildren.add(transformTree(kid));
      }
      return tf.newTreeNode(tag, newChildren);
    }

    String word = tree.firstChild().label().value();
    int numChars = word.codePointCount(0, word.length());
    List<Tree> newPreterms = new ArrayList<>(numChars);
    for (int index = 0, offset = 0; index < numChars; index++) {
      // Advance by one code point so surrogate pairs are kept together.
      int next = word.offsetByCodePoints(offset, 1);
      Tree newLeaf = tf.newLeaf(word.substring(offset, next));
      String charTag = tag + positionSuffix(index, numChars);
      newPreterms.add(tf.newTreeNode(charTag, Collections.<Tree>singletonList(newLeaf)));
      offset = next;
    }
    return tf.newTreeNode(tag, newPreterms);
  }

  /**
   * Chooses the tag suffix for the character at {@code index} in a word of
   * {@code numChars} characters.
   */
  private static String positionSuffix(int index, int numChars) {
    if (useTwoCharTags) {
      return (numChars == 1 || index == 0) ? "_S" : "_M";
    }
    if (numChars == 1) {
      return "_S";
    } else if (index == 0) {
      return "_B";
    } else if (index == numChars - 1) {
      return "_E";
    } else {
      return "_M";
    }
  }

  /**
   * Reverses {@link #transformTree(Tree)} in place. Every node whose
   * children are all preterminals, and whose first child carries a
   * character-level tag, has those children replaced by a single leaf
   * holding the concatenation of their characters.
   *
   * @param tree the tree to restore; it is modified in place
   * @return the same {@code tree} instance that was passed in
   */
  public Tree untransformTree(Tree tree) {
    if (tree.isPrePreTerminal()) {
      String firstChildTag = tree.firstChild().label().value();
      if (CHAR_TAG_PATTERN.matcher(firstChildTag).matches()) {
        StringBuilder word = new StringBuilder();
        for (Tree charPreterm : tree.children()) {
          word.append(charPreterm.firstChild().label().value());
        }
        Tree newChild = tree.treeFactory().newLeaf(word.toString());
        tree.setChildren(Collections.singletonList(newChild));
      }
    } else {
      for (Tree child : tree.children()) {
        untransformTree(child);
      }
    }
    return tree;
  }

  /**
   * Checks that transforming and then untransforming each tree of a treebank
   * gives back a tree equal to the original, and reports any mismatch.
   *
   * @param e the extender under test
   * @param tb the treebank whose trees are checked
   * @param pw where mismatching original/round-tripped pairs are printed
   */
  private static void testTransAndUntrans(CharacterLevelTagExtender e, Treebank tb, PrintWriter pw) {
    for (Tree tree : tb) {
      Tree oldTree = tree.treeSkeletonCopy();
      // transformTree returns a new tree and leaves its argument untouched,
      // so the round trip has to be run on the returned tree.
      Tree roundTripped = e.untransformTree(e.transformTree(tree));
      if (!roundTripped.equals(oldTree)) {
        pw.println("NOT EQUAL AFTER UNTRANSFORMATION!!!");
        pw.println();
        oldTree.pennPrint(pw);
        pw.println();
        roundTripped.pennPrint(pw);
        pw.println("------------------");
      }
    }
  }

  /**
   * for testing -- CURRENTLY BROKEN!!!
   * <p>
   * Trains a character-tag parser from the treebank (or, if the second
   * argument is rejected as a number range, loads a parser from that path),
   * then parses the test trees and writes gold and parsed output to
   * {@code out.chi} in GB18030.
   *
   * @param args treebankPath trainNums testNums
   * @throws IOException if the test treebank or the output file cannot be
   *         read or written
   */
  public static void main(String[] args) throws IOException {
    if (args.length != 3) {
      throw new RuntimeException("args: treebankPath trainNums testNums");
    }

    ChineseTreebankParserParams ctpp = new ChineseTreebankParserParams();
    ctpp.charTags = true;
    // TODO: these options are getting clobbered by reading in the
    // parser object (unless it's a text file parser?)
    Options op = new Options(ctpp);
    op.doDep = false;
    op.testOptions.maxLength = 90;

    LexicalizedParser lp;
    try {
      FileFilter trainFilt = new NumberRangesFileFilter(args[1], false);

      lp = LexicalizedParser.trainFromTreebank(args[0], trainFilt, op);
      try {
        String filename = "chineseCharTagPCFG.ser.gz";
        log.info("Writing parser in serialized format to file " + filename + " ");
        System.err.flush();
        // try-with-resources closes the stream even when writeObject fails.
        try (ObjectOutputStream out = IOUtils.writeStreamFromString(filename)) {
          out.writeObject(lp);
        }
        log.info("done.");
      } catch (IOException ioe) {
        // Failing to save the model is not fatal; go on with the in-memory parser.
        ioe.printStackTrace();
      }
    } catch (IllegalArgumentException e) {
      // args[1] was rejected as a number range: treat it as a parser model path.
      lp = LexicalizedParser.loadModel(args[1], op);
    }

    FileFilter testFilt = new NumberRangesFileFilter(args[2], false);
    MemoryTreebank testTreebank = ctpp.memoryTreebank();
    testTreebank.loadPath(new File(args[0]), testFilt);
    try (PrintWriter pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream("out.chi"), "GB18030"), true)) {
      WordCatEquivalenceClasser eqclass = new WordCatEquivalenceClasser();
      WordCatEqualityChecker eqcheck = new WordCatEqualityChecker();
      EquivalenceClassEval eval = new EquivalenceClassEval(eqclass, eqcheck);
      //    System.out.println("Preterminals:" + preterminals);
      System.out.println("Testing...");
      for (Tree gold : testTreebank) {
        Tree tree;
        try {
          tree = lp.parseTree(gold.yieldHasWord());
          if (tree == null) {
            System.out.println("Failed to parse " + gold.yieldHasWord());
            continue;
          }
        } catch (Exception e) {
          e.printStackTrace();
          continue;
        }
        gold = gold.firstChild();
        pw.println(SentenceUtils.listToString(gold.preTerminalYield()));
        pw.println(SentenceUtils.listToString(gold.yield()));
        gold.pennPrint(pw);

        pw.println(tree.preTerminalYield());
        pw.println(tree.yield());
        tree.pennPrint(pw);
        //      Collection allBrackets = WordCatConstituent.allBrackets(tree);
        //      Collection goldBrackets = WordCatConstituent.allBrackets(gold);
        //      eval.eval(allBrackets, goldBrackets);
        eval.displayLast();
      }
      System.out.println();
      System.out.println();
      eval.display();
    }
  }

}