package edu.stanford.nlp.trees.international.arabic;

import java.util.Arrays;
import java.util.List;
import java.util.Set;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.SentenceUtils;
import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeFactory;
import edu.stanford.nlp.trees.international.arabic.ArabicTreeNormalizer.ArabicEmptyFilter;
import java.util.function.Predicate;
import edu.stanford.nlp.util.Generics;

/**
 * Various static convenience methods for processing Arabic parse trees.
 *
 * @author Spence Green
 *
 */
public class ATBTreeUtils {

  /** Filter handed to {@link Tree#prune} before a tree is flattened. */
  private static final Predicate<Tree> emptyFilter = new ArabicEmptyFilter();

  /** Factory used to build the pruned copies of input trees. */
  private static final TreeFactory tf = new LabeledScoredTreeFactory();

  /** The default segmentation marker. Can be changed for processing e.g. IBM Arabic. */
  public static String segMarker = "-";

  /** The default morpheme boundary marker. Present only in the vocalized sections. */
  public static final String morphBoundary = "+";

  /** Global tag for all punctuation. */
  public static final String puncTag = "PUNC";

  /** Whitespace-separated list of the reserved tokens. */
  private static final String reservedWordList = "-PLUS- -LRB- -RRB-";

  /** The reserved tokens, i.e. the escape sequences produced by {@link #escape(String)}. */
  public static final Set<String> reservedWords = Generics.newHashSet();
  static {
    reservedWords.addAll(Arrays.asList(reservedWordList.split("\\s+")));
  }

  private ATBTreeUtils() {} // static class

  /**
   * Escapes tokens from flat strings that are reserved for usage in the ATB.
   *
   * @param s - An Arabic string
   * @return A string with all reserved characters replaced by the appropriate
   *         tokens, or {@code null} if {@code s} is {@code null}
   */
  public static String escape(String s) {
    if (s == null) {
      return null;
    }

    // LDC escape sequences (as of ATB3p3). String.replace substitutes every
    // literal occurrence, so no regex quoting or compilation is involved.
    return s.replace("(", "-LRB-")
            .replace(")", "-RRB-")
            .replace("+", "-PLUS-");
  }

  /**
   * Reverts escaping from a flat string.
   *
   * @param s - An Arabic string
   * @return A string with all escape tokens replaced by the characters they
   *         stand for, or {@code null} if {@code s} is {@code null}
   */
  public static String unEscape(String s) {
    if (s == null) {
      return null;
    }

    // LDC escape sequences (as of ATB3p3), replaced literally.
    return s.replace("-LRB-", "(")
            .replace("-RRB-", ")")
            .replace("-PLUS-", "+");
  }

  /**
   * Returns the string associated with the input parse tree. Nodes rejected
   * by the {@link ArabicEmptyFilter} are pruned first. ATB-specific escape
   * sequences (e.g., "-RRB-" for ")") are NOT converted back; pass the result
   * to {@link #unEscape(String)} if the raw characters are needed.
   *
   * @param t - A parse tree
   * @return The yield of the pruned parse tree, or the empty string if
   *         pruning leaves no tree at all
   */
  public static String flattenTree(Tree t) {
    Tree pruned = t.prune(emptyFilter, tf);
    if (pruned == null) {
      // prune() yields null when the filter removes the whole tree.
      return "";
    }
    return SentenceUtils.listToString(pruned.yield());
  }

  /**
   * Converts a parse tree into a string of tokens. Each token is a word and
   * its POS tag separated by the delimiter specified by {@code separator}.
   * Nodes rejected by the {@link ArabicEmptyFilter} are pruned first.
   *
   * @param t - A parse tree
   * @param removeEscaping - If true, remove LDC escape characters. Otherwise, leave them.
   * @param separator Word/tag separator
   * @return A string of tagged words, or the empty string if pruning leaves
   *         no tree at all
   */
  public static String taggedStringFromTree(Tree t, boolean removeEscaping, String separator) {
    Tree pruned = t.prune(emptyFilter, tf);
    if (pruned == null) {
      // prune() yields null when the filter removes the whole tree.
      return "";
    }

    List<CoreLabel> taggedSentence = pruned.taggedLabeledYield();
    for (CoreLabel token : taggedSentence) {
      String word = removeEscaping ? unEscape(token.word()) : token.word();
      // Keep the word and the value of the label in sync.
      token.setWord(word);
      token.setValue(word);
    }
    return SentenceUtils.listToString(taggedSentence, false, separator);
  }

  /**
   * Small demonstration: prints the escaped form of a sample string.
   *
   * @param args Ignored
   */
  public static void main(String[] args) {
    String debug = "( the big lion ) + (the small rabbit)";
    String escaped = ATBTreeUtils.escape(debug);
    System.out.println(escaped);
  }
}