package edu.stanford.nlp.trees.international.arabic; import edu.stanford.nlp.util.logging.Redwood; import java.io.Serializable; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.regex.Pattern; import edu.stanford.nlp.international.arabic.pipeline.DefaultLexicalMapper; import edu.stanford.nlp.international.morph.MorphoFeatureSpecification; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.ling.HasTag; import edu.stanford.nlp.trees.treebank.Mapper; import edu.stanford.nlp.trees.BobChrisTreeNormalizer; import edu.stanford.nlp.trees.Tree; import edu.stanford.nlp.trees.TreeFactory; import edu.stanford.nlp.trees.tregex.TregexMatcher; import edu.stanford.nlp.trees.tregex.TregexPattern; import java.util.function.Predicate; import edu.stanford.nlp.util.Pair; /** * Normalizes both terminals and non-terminals in Penn Arabic Treebank (ATB) * trees. Among the normalizations that can be performed: * * <ul> * <li> Adds a ROOT node to the top of every tree * <li> Strips all the interesting stuff off of the POS tags. * <li> Can keep NP-TMP annotations (retainNPTmp parameter) * <li> Can keep whatever annotations there are on verbs that are sisters * to predicatively marked (-PRD) elements (markPRDverb parameter) * [Chris Nov 2006: I'm a bit unsure on that one!] * <li> Can keep categories unchanged, i.e., not mapped to basic categories * (changeNoLabels parameter) * <li> Counts pronoun deletions ("nullp" and "_") as empty; filters * </ul> * * @author Roger Levy * @author Anna Rafferty * @author Spence Green */ public class ArabicTreeNormalizer extends BobChrisTreeNormalizer { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(ArabicTreeNormalizer.class); private final boolean retainNPTmp; private final boolean retainNPSbj; private final boolean markPRDverb; private final boolean changeNoLabels; private final boolean retainPPClr; private final Pattern prdPattern; private final TregexPattern prdVerbPattern; private final TregexPattern npSbjPattern; private final String rootLabel; private final Mapper lexMapper = new DefaultLexicalMapper(); public ArabicTreeNormalizer(boolean retainNPTmp, boolean markPRDverb, boolean changeNoLabels, boolean retainNPSbj, boolean retainPPClr) { super(new ArabicTreebankLanguagePack()); this.retainNPTmp = retainNPTmp; this.retainNPSbj = retainNPSbj; this.markPRDverb = markPRDverb; this.changeNoLabels = changeNoLabels; this.retainPPClr = retainPPClr; rootLabel = tlp.startSymbol(); prdVerbPattern = TregexPattern.compile("/^V[^P]/ > VP $ /-PRD$/=prd"); prdPattern = Pattern.compile("^[A-Z]+-PRD"); //Marks NP subjects that *do not* occur in verb-initial clauses npSbjPattern = TregexPattern.compile("/^NP-SBJ/ !> @VP"); emptyFilter = new ArabicEmptyFilter(); } public ArabicTreeNormalizer(boolean retainNPTmp, boolean markPRDverb, boolean changeNoLabels) { this(retainNPTmp, markPRDverb, changeNoLabels, false, false); } public ArabicTreeNormalizer(boolean retainNPTmp, boolean markPRDverb) { this(retainNPTmp,markPRDverb,false); } public ArabicTreeNormalizer(boolean retainNPTmp) { this(retainNPTmp,false); } public ArabicTreeNormalizer() { this(false); } @Override public String normalizeNonterminal(String category) { String normalizedString; if (changeNoLabels) { normalizedString = category; } else if (retainNPTmp && category != null && category.startsWith("NP-TMP")) { normalizedString = "NP-TMP"; } else if (retainNPSbj && category != null && category.startsWith("NP-SBJ")) { normalizedString = "NP-SBJ"; } else if (retainPPClr && category != null && category.startsWith("PP-CLR")) { normalizedString = "PP-CLR"; } else if (markPRDverb && category != null && prdPattern.matcher(category).matches()) { normalizedString = category; } else { // otherwise, return the basicCategory (and turn null to ROOT) normalizedString = super.normalizeNonterminal(category); } return normalizedString.intern(); } @Override public Tree normalizeWholeTree(Tree tree, TreeFactory tf) { tree = tree.prune(emptyFilter, tf).spliceOut(aOverAFilter, tf); for (Tree t : tree) { if(t.isLeaf()) { //Strip off morphological analyses and place them in the OriginalTextAnnotation, which is //specified by HasContext. if(t.value().contains(MorphoFeatureSpecification.MORPHO_MARK)) { String[] toks = t.value().split(MorphoFeatureSpecification.MORPHO_MARK); if(toks.length != 2) System.err.printf("%s: Word contains malformed morph annotation: %s%n",this.getClass().getName(),t.value()); else if(t.label() instanceof CoreLabel) { ((CoreLabel) t.label()).setValue(toks[0].trim().intern()); ((CoreLabel) t.label()).setWord(toks[0].trim().intern()); Pair<String,String> lemmaMorph = MorphoFeatureSpecification.splitMorphString(toks[0], toks[1]); String lemma = lemmaMorph.first(); String morphAnalysis = lemmaMorph.second(); if (lemma.equals(toks[0])) { ((CoreLabel) t.label()).setOriginalText(toks[1].trim().intern()); } else { // TODO(speneg): Does this help? String newLemma = lexMapper.map(null, lemma); if (newLemma == null || newLemma.trim().length() == 0) { newLemma = lemma; } String newMorphAnalysis = newLemma + MorphoFeatureSpecification.LEMMA_MARK + morphAnalysis; ((CoreLabel) t.label()).setOriginalText(newMorphAnalysis.intern()); } } else { System.err.printf("%s: Cannot store morph analysis in non-CoreLabel: %s%n",this.getClass().getName(),t.label().getClass().getName()); } } } else if (t.isPreTerminal()) { if (t.value() == null || t.value().equals("")) { System.err.printf("%s: missing tag for\n%s\n",this.getClass().getName(),t.pennString()); } else if(t.label() instanceof HasTag) { ((HasTag) t.label()).setTag(t.value()); } } else { //Phrasal nodes // there are some nodes "/" missing preterminals. We'll splice in a tag for these. int nk = t.numChildren(); List<Tree> newKids = new ArrayList<>(nk); for (int j = 0; j < nk; j++) { Tree child = t.getChild(j); if (child.isLeaf()) { System.err.printf("%s: Splicing in DUMMYTAG for%n%s%n",this.getClass().getName(),t.toString()); newKids.add(tf.newTreeNode("DUMMYTAG", Collections.singletonList(child))); } else { newKids.add(child); } } t.setChildren(newKids); } }//Every node in the tree has now been processed // // Additional processing for specific phrasal annotations // // special global coding for moving PRD annotation from constituent to verb tag. if (markPRDverb) { TregexMatcher m = prdVerbPattern.matcher(tree); Tree match = null; while (m.find()) { if (m.getMatch() != match) { match = m.getMatch(); match.label().setValue(match.label().value() + "-PRDverb"); Tree prd = m.getNode("prd"); prd.label().setValue(super.normalizeNonterminal(prd.label().value())); } } } //Mark *only* subjects in verb-initial clauses if(retainNPSbj) { TregexMatcher m = npSbjPattern.matcher(tree); while (m.find()) { Tree match = m.getMatch(); match.label().setValue("NP"); } } if (tree.isPreTerminal()) { // The whole tree is a bare tag: bad! String val = tree.label().value(); if (val.equals("CC") || val.startsWith("PUNC") || val.equals("CONJ")) { System.err.printf("%s: Bare tagged word being wrapped in FRAG\n%s\n",this.getClass().getName(),tree.pennString()); tree = tf.newTreeNode("FRAG", Collections.singletonList(tree)); } else { System.err.printf("%s: Bare tagged word\n%s\n",this.getClass().getName(),tree.pennString()); } } //Add start symbol so that the root has only one sub-state. Escape any enclosing brackets. //If the "tree" consists entirely of enclosing brackets e.g. ((())) then this method //will return null. In this case, readers e.g. PennTreeReader will try to read the next tree. while(tree != null && (tree.value() == null || tree.value().equals("")) && tree.numChildren() <= 1) tree = tree.firstChild(); if(tree != null && !tree.value().equals(rootLabel)) tree = tf.newTreeNode(rootLabel, Collections.singletonList(tree)); return tree; } /** * Remove traces and pronoun deletion markers. */ public static class ArabicEmptyFilter implements Predicate<Tree>, Serializable { private static final long serialVersionUID = 7417844982953945964L; public boolean test(Tree t) { // Pronoun deletions if(t.isPreTerminal() && (t.value().equals("PRON_1S") || t.value().equals("PRP")) && (t.firstChild().value().equals("nullp") || t.firstChild().value().equals("نللة") || t.firstChild().value().equals("-~a"))) return false; // Traces else if(t.isPreTerminal() && t.value() != null && t.value().equals("-NONE-")) return false; return true; } } private static final long serialVersionUID = -1592231121068698494L; }