package edu.stanford.nlp.trees.international.arabic;

import edu.stanford.nlp.util.logging.Redwood;

import java.util.regex.Pattern;

import edu.stanford.nlp.trees.AbstractCollinsHeadFinder;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.util.Generics;

/**
 * Find the head of an Arabic tree, using the usual kind of heuristic
 * head finding rules.
 * <p>
 * <i>Implementation notes.</i>
 * TO DO: make sure that -PRD marked elements are always chosen as heads.
 * (Has this now been successfully done or not??)
 * <p>
 * Mona: I added the 8 new Nonterm for the merged DT with its following
 * category as a rule the DT nonterm is right headed, the 8 new nonterm DTs
 * are: DTCD, DTRB, DTRP, DTJJ, DTNN, DTNNS, DTNNP, DTNNPS.
 * This was added Dec 7th, 2004.
 *
 * @author Roger Levy
 * @author Mona Diab
 * @author Christopher Manning (added new stuff for ATBp3v3)
 */
public class ArabicHeadFinder extends AbstractCollinsHeadFinder {

  /** A logger for this class. Currently referenced only from commented-out debug code. */
  private static final Redwood.RedwoodChannels log = Redwood.channels(ArabicHeadFinder.class);

  private static final long serialVersionUID = 6203368998430280740L;

  /** The tag set whose tag names were used to build the head rules of this instance. */
  protected TagSet tagSet;

  /* A work in progress. There may well be a better way to parameterize the HeadFinders via tagset. */

  /**
   * The part-of-speech tag inventories this head finder can be parameterized with.
   * Each constant supplies the tag names that are spliced into the head rules.
   */
  public enum TagSet {

    BIES_COLLAPSED {
      @Override
      String prep() { return "IN"; }

      @Override
      String noun() { return "NN"; } // really there should be several here.

      @Override
      String det() { return "DT"; }

      @Override
      String adj() { return "JJ"; }

      @Override
      String detPlusNoun() { return "DTNN"; } // really there should be several here; major point is that the det part is ignored completely

      @Override
      TreebankLanguagePack langPack() { return new ArabicTreebankLanguagePack(); }
    },

    ORIGINAL {
      @Override
      String prep() { return "PREP"; }

      @Override
      String noun() { return "NOUN"; }

      @Override
      String det() { return "DET"; }

      @Override
      String adj() { return "ADJ"; }

      @Override
      String detPlusNoun() { return "DET+NN"; }

      @Override
      TreebankLanguagePack langPack() { return new ArabicTreebankLanguagePack(); }
    };

    /** @return the preposition tag of this tag set */
    abstract String prep();

    /** @return the (basic) noun tag of this tag set */
    abstract String noun();

    /** @return the adjective tag of this tag set */
    abstract String adj();

    /** @return the determiner tag of this tag set */
    abstract String det();

    /** @return the tag used for a determiner merged with a following noun */
    abstract String detPlusNoun();

    /** @return a newly constructed language pack to use with this tag set */
    abstract TreebankLanguagePack langPack();

    /**
     * Look up a tag set by name.
     *
     * @param str Either "BIES_COLLAPSED" or "ORIGINAL" (exact match; must not be null)
     * @return the corresponding tag set
     * @throws IllegalArgumentException if {@code str} names no known tag set
     */
    static TagSet tagSet(String str) {
      switch (str) {
        case "BIES_COLLAPSED":
          return BIES_COLLAPSED;
        case "ORIGINAL":
          return ORIGINAL;
        default:
          throw new IllegalArgumentException("Don't know anything about tagset " + str);
      }
    }
  }

  /**
   * Construct an ArabicHeadFinder with a new {@link ArabicTreebankLanguagePack}
   * and the {@link TagSet#BIES_COLLAPSED} tag set.
   */
  public ArabicHeadFinder() {
    this(new ArabicTreebankLanguagePack());
  }

  /**
   * Construct an ArabicHeadFinder with a String parameter corresponding to the tagset in use.
   *
   * @param tagSet Either "ORIGINAL" or "BIES_COLLAPSED"
   */
  public ArabicHeadFinder(String tagSet) {
    this(TagSet.tagSet(tagSet));
  }

  /**
   * Construct an ArabicHeadFinder for the given tag set, using the language
   * pack that the tag set itself supplies.
   *
   * @param tagSet The tag set in use
   */
  public ArabicHeadFinder(TagSet tagSet) {
    this(tagSet.langPack(), tagSet);
    //this(new ArabicTreebankLanguagePack(), tagSet);
  }

  /**
   * Construct an ArabicHeadFinder with the given language pack and the
   * {@link TagSet#BIES_COLLAPSED} tag set.
   *
   * @param tlp The language pack to pass to the superclass
   */
  public ArabicHeadFinder(TreebankLanguagePack tlp) {
    this(tlp, TagSet.BIES_COLLAPSED);
  }

  /**
   * Construct an ArabicHeadFinder and fill in the head rule table.
   * <p>
   * Each entry of {@code nonTerminalInfo} maps a parent category to a list of
   * rules; each rule is a {@code String[]} whose first element is a direction
   * keyword ("left", "right" or "rightdis" here) followed by the child
   * categories it applies to. The table is interpreted by the superclass.
   *
   * @param tlp The language pack to pass to the superclass; its start symbol also gets a rule
   * @param tagSet The tag set supplying the preposition, noun, adjective and det+noun tag names
   */
  protected ArabicHeadFinder(TreebankLanguagePack tlp, TagSet tagSet) {
    super(tlp);
    this.tagSet = tagSet;
    //log.info("##testing: noun tag is " + tagSet.noun());

    nonTerminalInfo = Generics.newHashMap();

    nonTerminalInfo.put("NX", new String[][]{{"left", "DT","DTNN","DTNNS","DTNNP", "DTNNPS", "DTJJ", "DTNOUN_QUANT", "NOUN_QUANT", "MWNP"}});

    nonTerminalInfo.put("ADJP", new String[][]{
        {"rightdis", tagSet.adj(), "DTJJ", "ADJ_NUM", "DTADJ_NUM", "JJR", "DTJJR", "MWADJP"},
        {"right", "ADJP", "VN", tagSet.noun(), "MWNP", "NNP", "NNPS", "NNS", "DTNN", "DTNNS","DTNNP","DTNNPS","DTJJ", "DTNOUN_QUANT", "NOUN_QUANT"},
        {"right", "RB", "MWADVP", "CD","DTRB","DTCD"},
        {"right", "DT"}}); // sometimes right, sometimes left headed??
    nonTerminalInfo.put("MWADJP", new String[][]{
        {"rightdis", tagSet.adj(), "DTJJ", "ADJ_NUM", "DTADJ_NUM", "JJR", "DTJJR"},
        {"right", tagSet.noun(), "MWNP", "NNP", "NNPS", "NNS", "DTNN", "DTNNS","DTNNP","DTNNPS","DTJJ", "DTNOUN_QUANT", "NOUN_QUANT"},
        {"right", "RB", "MWADVP", "CD","DTRB","DTCD"},
        {"right", "DT"}}); // sometimes right, sometimes left headed??

    nonTerminalInfo.put("ADVP", new String[][]{
        {"left", "WRB", "RB", "MWADVP", "ADVP", "WHADVP","DTRB"},
        {"left", "CD", "RP", tagSet.noun(), "MWNP", "CC", "MWCONJP", tagSet.adj(), "MWADJP", "DTJJ", "ADJ_NUM", "DTADJ_NUM", "IN", "MWPP", "NP", "NNP", "NOFUNC","DTRP","DTNN","DTNNP","DTNNPS","DTNNS","DTJJ", "DTNOUN_QUANT", "NOUN_QUANT"}}); // NNP is a gerund that they called an unknown (=NNP, believe it or not...)
    nonTerminalInfo.put("MWADVP", new String[][]{
        {"left", "WRB", "RB", "ADVP", "WHADVP","DTRB"},
        {"left", "CD", "RP", tagSet.noun(), "MWNP", "CC", "MWCONJP", tagSet.adj(), "MWADJP", "DTJJ", "ADJ_NUM", "DTADJ_NUM", "IN", "MWPP", "NP", "NNP", "NOFUNC","DTRP","DTNN","DTNNP","DTNNPS","DTNNS","DTJJ", "DTNOUN_QUANT", "NOUN_QUANT"}}); // NNP is a gerund that they called an unknown (=NNP, believe it or not...)

    nonTerminalInfo.put("CONJP", new String[][]{{"right", "IN", "RB", "MWADVP", tagSet.noun(), "MWNP", "NNS","NNP", "NNPS", "DTRB", "DTNN", "DTNNS", "DTNNP", "DTNNPS", "DTNOUN_QUANT", "NOUN_QUANT"}});
    nonTerminalInfo.put("MWCONJP", new String[][]{{"right", "IN", "RB", "MWADVP", tagSet.noun(), "MWNP", "NNS","NNP", "NNPS", "DTRB", "DTNN", "DTNNS", "DTNNP", "DTNNPS", "DTNOUN_QUANT", "NOUN_QUANT"}});

    nonTerminalInfo.put("FRAG", new String[][]{
        {"left", tagSet.noun(), "MWNP", "NNPS", "NNP","NNS", "DTNN", "DTNNS", "DTNNP", "DTNNPS", "DTNOUN_QUANT", "NOUN_QUANT"},
        {"left", "VBP"}});
    nonTerminalInfo.put("MWFRAG", new String[][]{
        {"left", tagSet.noun(), "MWNP", "NNPS", "NNP","NNS", "DTNN", "DTNNS", "DTNNP", "DTNNPS", "DTNOUN_QUANT", "NOUN_QUANT"},
        {"left", "VBP"}});

    nonTerminalInfo.put("INTJ", new String[][]{{"left", "RP", "UH", "DTRP"}});

    nonTerminalInfo.put("LST", new String[][]{{"left"}});

    nonTerminalInfo.put("NAC", new String[][]{
        {"left", "NP", "SBAR", "PP", "MWP","ADJP", "S", "PRT", "UCP"},
        {"left", "ADVP"}}); // note: maybe CC, RB should be the heads?

    nonTerminalInfo.put("NP", new String[][]{
        {"left", tagSet.noun(), "MWNP", tagSet.detPlusNoun(), "NNS", "NNP", "NNPS", "NP", "PRP", "WHNP", "QP", "WP", "DTNNS", "DTNNPS", "DTNNP", "NOFUNC", "NO_FUNC", "DTNOUN_QUANT", "NOUN_QUANT"},
        {"left", tagSet.adj(), "MWADJP", "DTJJ", "JJR", "DTJJR", "ADJ_NUM", "DTADJ_NUM"},
        {"right", "CD", "DTCD"},
        {"left", "PRP$"},
        {"right", "DT"}}); // should the JJ rule be left or right?
    nonTerminalInfo.put("MWNP", new String[][]{
        {"left", tagSet.noun(), "MWNP", tagSet.detPlusNoun(), "NNS", "NNP", "NNPS", "PRP", "QP", "WP", "DTNNS", "DTNNPS", "DTNNP", "DTNOUN_QUANT", "NOUN_QUANT"},
        {"left", tagSet.adj(), "MWADJP", "DTJJ", "JJR", "DTJJR", "ADJ_NUM", "DTADJ_NUM"},
        {"right", "CD", "DTCD"},
        {"left", "PRP$"},
        {"right", "DT"}}); // should the JJ rule be left or right?

    nonTerminalInfo.put("PP", new String[][]{
        {"left", tagSet.prep(), "MWPP", "PP", "MWP","PRT", "X"},
        {"left", "NNP", "RP", tagSet.noun(), "MWNP"},
        {"left", "NP"}}); // NN is for a mistaken "fy", and many wsT
    nonTerminalInfo.put("MWPP", new String[][]{
        {"left", tagSet.prep(), "PP", "MWP","PRT", "X"},
        {"left", "NNP", "RP", tagSet.noun(), "MWNP"},
        {"left", "NP"}}); // NN is for a mistaken "fy", and many wsT

    nonTerminalInfo.put("PRN", new String[][]{{"left", "NP"}}); // don't get PUNC
    nonTerminalInfo.put("MWPRN", new String[][]{{"left", "IN"}}); // don't get PUNC

    nonTerminalInfo.put("PRT", new String[][]{{"left", "RP", "PRT", "IN", "DTRP"}});

    nonTerminalInfo.put("QP", new String[][]{{"right", "CD", "DTCD", tagSet.noun(), "MWNP", tagSet.adj(), "MWADJP", "NNS", "NNP", "NNPS", "DTNN", "DTNNS", "DTNNP", "DTNNPS", "DTJJ", "DTNOUN_QUANT", "NOUN_QUANT"}});

    nonTerminalInfo.put("S", new String[][]{
        {"left", "VP", "MWVP", "S"},
        {"right", "PP", "MWP","ADVP", "SBAR", "UCP", "ADJP"}}); // really important to put in -PRD sensitivity here!
    nonTerminalInfo.put("MWS", new String[][]{
        {"left", "VP", "MWVP", "S"},
        {"right", "PP", "MWP","ADVP", "SBAR", "UCP", "ADJP"}}); // really important to put in -PRD sensitivity here!

    nonTerminalInfo.put("SQ", new String[][]{{"left", "VP", "MWVP", "PP", "MWP"}}); // to be principled, we need -PRD sensitivity here too.

    nonTerminalInfo.put("SBAR", new String[][]{
        {"left", "WHNP", "WHADVP", "WRB", "RP", "IN", "SBAR", "CC", "MWCONJP", "WP", "WHPP", "ADVP", "PRT", "RB", "MWADVP", "X", "DTRB", "DTRP"},
        {"left", tagSet.noun(), "MWNP", "NNP", "NNS", "NNPS", "DTNN", "DTNNS", "DTNNP", "DTNNPS", "DTNOUN_QUANT", "NOUN_QUANT"},
        {"left", "S"}});
    nonTerminalInfo.put("MWSBAR", new String[][]{
        {"left", "WHNP", "WHADVP", "WRB", "RP", "IN", "SBAR", "CC", "MWCONJP", "WP", "WHPP", "ADVP", "PRT", "RB", "MWADVP", "X", "DTRB", "DTRP"},
        {"left", tagSet.noun(), "MWNP", "NNP", "NNS", "NNPS", "DTNN", "DTNNS", "DTNNP", "DTNNPS", "DTNOUN_QUANT", "NOUN_QUANT"},
        {"left", "S"}});

    nonTerminalInfo.put("SBARQ", new String[][]{
        {"left", "WHNP", "WHADVP", "RP", "IN", "SBAR", "CC", "MWCONJP", "WP", "WHPP", "ADVP", "PRT", "RB", "MWADVP", "X"},
        {"left", tagSet.noun(), "MWNP", "NNP", "NNS", "NNPS","DTNN", "DTNNS", "DTNNP", "DTNNPS", "DTNOUN_QUANT", "NOUN_QUANT"},
        {"left", "S"}}); // copied from SBAR rule -- look more closely when there's time

    nonTerminalInfo.put("UCP", new String[][]{{"left"}});

    nonTerminalInfo.put("VP", new String[][]{
        {"left", "VBD", "VBN", "VBP", "VBG", "DTVBG", "VN", "DTVN", "VP", "RB", "MWADVP", "X","VB"},
        {"left", "IN"},
        {"left", "NNP", tagSet.noun(), "MWNP", "DTNN", "DTNNP", "DTNNPS", "DTNNS", "DTNOUN_QUANT", "NOUN_QUANT"}}); // exclude RP because we don't want negation markers as heads -- no useful information?
    nonTerminalInfo.put("MWVP", new String[][]{
        {"left", "VBD", "VBN", "VBP", "VBG", "DTVBG", "VN", "DTVN", "VP", "MWVP", "RB", "MWADVP", "X","VB"},
        {"left", "IN"},
        {"left", "NNP", tagSet.noun(), "MWNP", "DTNN", "DTNNP", "DTNNPS", "DTNNS", "DTNOUN_QUANT", "NOUN_QUANT"}}); // exclude RP because we don't want negation markers as heads -- no useful information?

    //also, RB is used as gerunds

    nonTerminalInfo.put("WHADVP", new String[][]{
        {"left", "WRB", "WP"},
        {"right", "CC", "MWCONJP"},
        {"left", "IN"}});

    nonTerminalInfo.put("WHNP", new String[][]{{"right", "WP"}});

    nonTerminalInfo.put("WHPP", new String[][]{{"left", "IN", "MWPP", "RB", "MWADVP"}});

    nonTerminalInfo.put("X", new String[][]{{"left"}});

    //Added by Mona 12/7/04 for the newly created DT nonterm cat
    nonTerminalInfo.put("DTNN", new String[][]{{"right"}});
    nonTerminalInfo.put("DTNNS", new String[][]{{"right"}});
    nonTerminalInfo.put("DTNNP", new String[][]{{"right"}});
    nonTerminalInfo.put("DTNNPS", new String[][]{{"right"}});
    nonTerminalInfo.put("DTJJ", new String[][]{{"right"}});
    nonTerminalInfo.put("DTRP", new String[][]{{"right"}});
    nonTerminalInfo.put("DTRB", new String[][]{{"right"}});
    nonTerminalInfo.put("DTCD", new String[][]{{"right"}});
    nonTerminalInfo.put("DTIN", new String[][]{{"right"}});

    // stand-in dependency:
    nonTerminalInfo.put("EDITED", new String[][]{{"left"}});
    nonTerminalInfo.put(tlp.startSymbol(), new String[][]{{"left"}});

    // one stray SINV in the training set...garbage head rule here.
    nonTerminalInfo.put("SINV", new String[][]{{"left","ADJP","VP"}});
  }

  /**
   * Matches any category label that ends in "-PRD".
   * Kept as an instance field so the class's serialized field set is unchanged.
   */
  private final Pattern predPattern = Pattern.compile(".*-PRD$");

  /**
   * Predicatively marked elements in a sentence should be noted as heads.
   * <p>
   * Only trees whose value is exactly "S" are inspected; the first child
   * (in child order) whose value matches {@code .*-PRD$} is returned.
   *
   * @param t The tree whose children are searched
   * @return the first -PRD marked child of an "S" node, or {@code null} if
   *         {@code t} is not an "S" node or has no such child
   */
  @Override
  protected Tree findMarkedHead(Tree t) {
    String cat = t.value();
    if (cat.equals("S")) {
      Tree[] kids = t.children();
      for (Tree kid : kids) {
        if (predPattern.matcher(kid.value()).matches()) {
          return kid;
        }
      }
    }
    return null;
  }

}