package edu.stanford.nlp.trees.international.pennchinese; import edu.stanford.nlp.trees.AbstractCollinsHeadFinder; import edu.stanford.nlp.trees.TreebankLanguagePack; import edu.stanford.nlp.util.Generics; /** * A headfinder implementing Dan Bikel's head rules. * March 2005: Updated to match the head-finding rules found in * Bikel's thesis (2004). * * @author Galen Andrew * @author Christopher Manning. */ public class BikelChineseHeadFinder extends AbstractCollinsHeadFinder { /** * */ private static final long serialVersionUID = -5445795668059315082L; public BikelChineseHeadFinder() { this(new ChineseTreebankLanguagePack()); } public BikelChineseHeadFinder(TreebankLanguagePack tlp) { super(tlp); nonTerminalInfo = Generics.newHashMap(); // these are first-cut rules defaultRule = new String[]{"right"}; // ROOT is not always unary for chinese -- PAIR is a special notation // that the Irish people use for non-unary ones.... nonTerminalInfo.put("ROOT", new String[][]{{"left", "IP"}}); nonTerminalInfo.put("PAIR", new String[][]{{"left", "IP"}}); // Major syntactic categories nonTerminalInfo.put("ADJP", new String[][]{{"right", "ADJP", "JJ"}, {"right", "AD", "NN", "CS"}}); nonTerminalInfo.put("ADVP", new String[][]{{"right", "ADVP", "AD"}}); nonTerminalInfo.put("CLP", new String[][]{{"right", "CLP", "M"}}); nonTerminalInfo.put("CP", new String[][]{{"right", "DEC", "SP"}, {"left", "ADVP", "CS"}, {"right", "CP", "IP"}}); nonTerminalInfo.put("DNP", new String[][]{{"right", "DNP", "DEG"}, {"right", "DEC"}}); nonTerminalInfo.put("DP", new String[][]{{"left", "DP", "DT"}}); nonTerminalInfo.put("DVP", new String[][]{{"right", "DVP", "DEV"}}); nonTerminalInfo.put("FRAG", new String[][]{{"right", "VV", "NR", "NN"}}); nonTerminalInfo.put("INTJ", new String[][]{{"right", "INTJ", "IJ"}}); nonTerminalInfo.put("IP", new String[][]{{"right", "IP", "VP"}, {"right", "VV"}}); nonTerminalInfo.put("LCP", new String[][]{{"right", "LCP", "LC"}}); nonTerminalInfo.put("LST", new String[][]{{"left", "LST", "CD", "OD"}}); nonTerminalInfo.put("NP", new String[][]{{"right", "NP", "NN", "NT", "NR", "QP"}}); nonTerminalInfo.put("PP", new String[][]{{"left", "PP", "P"}}); nonTerminalInfo.put("PRN", new String[][]{{"right", "NP", "IP", "VP", "NT", "NR", "NN"}}); nonTerminalInfo.put("QP", new String[][]{{"right", "QP", "CLP", "CD", "OD"}}); nonTerminalInfo.put("UCP", new String[][]{{"right"}}); nonTerminalInfo.put("VP", new String[][]{{"left", "VP", "VA", "VC", "VE", "VV", "BA", "LB", "VCD", "VSB", "VRD", "VNV", "VCP"}}); nonTerminalInfo.put("VCD", new String[][]{{"right", "VCD", "VV", "VA", "VC", "VE"}}); nonTerminalInfo.put("VCP", new String[][]{{"right", "VCP", "VV", "VA", "VC", "VE"}}); nonTerminalInfo.put("VRD", new String[][]{{"right", "VRD", "VV", "VA", "VC", "VE"}}); nonTerminalInfo.put("VSB", new String[][]{{"right", "VSB", "VV", "VA", "VC", "VE"}}); nonTerminalInfo.put("VNV", new String[][]{{"right", "VNV", "VV", "VA", "VC", "VE"}}); nonTerminalInfo.put("VPT", new String[][]{{"right", "VNV", "VV", "VA", "VC", "VE"}}); // VNV typo for VPT? None of either in ctb4. nonTerminalInfo.put("WHNP", new String[][]{{"right", "WHNP", "NP", "NN", "NT", "NR", "QP"}}); nonTerminalInfo.put("WHPP", new String[][]{{"left", "WHPP", "PP", "P"}}); // some POS tags apparently sit where phrases are supposed to be nonTerminalInfo.put("CD", new String[][]{{"right", "CD"}}); nonTerminalInfo.put("NN", new String[][]{{"right", "NN"}}); nonTerminalInfo.put("NR", new String[][]{{"right", "NR"}}); // parsing. It shouldn't affect anything else because heads of preterminals are not // generally queried - GMA nonTerminalInfo.put("VV", new String[][]{{"left"}}); nonTerminalInfo.put("VA", new String[][]{{"left"}}); nonTerminalInfo.put("VC", new String[][]{{"left"}}); nonTerminalInfo.put("VE", new String[][]{{"left"}}); } }