/* * Copyright 2012 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.io.penntree; import static java.util.Collections.singletonList; import static org.apache.uima.fit.util.FSCollectionFactory.create; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Stack; import java.util.StringTokenizer; import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.mutable.MutableInt; import org.apache.uima.cas.FeatureStructure; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent; public class PennTreeUtils { private static final Map<String, String> ESCAPE = new HashMap<String, String>(); private static final Map<String, String> UNESCAPE = new HashMap<String, String>(); static { ESCAPE.put("(", "-LRB-"); ESCAPE.put(")", "-RRB-"); } static { UNESCAPE.put("-LRB-", "("); UNESCAPE.put("-RRB-", ")"); } public static PennTreeNode convertPennTree(Constituent aConstituent) { PennTreeNode node = new PennTreeNode(); if (aConstituent.getSyntacticFunction() != null) { node.setLabel(aConstituent.getConstituentType() + '-' + aConstituent.getSyntacticFunction()); } else { node.setLabel(aConstituent.getConstituentType()); } List<PennTreeNode> children = new ArrayList<PennTreeNode>(); for (FeatureStructure c : create(aConstituent.getChildren())) { if (c instanceof Constituent) { children.add(convertPennTree((Constituent) c)); } if (c instanceof Token) { Token t = (Token) c; PennTreeNode term = new PennTreeNode(); term.setLabel(escapeToken(t.getCoveredText())); PennTreeNode preterm = new PennTreeNode(); preterm.setLabel(t.getPos().getPosValue()); preterm.setChildren(singletonList(term)); children.add(preterm); } } node.setChildren(children); return node; } public static String escapeToken(String aToken) { String value = ESCAPE.get(aToken); return value == null ? aToken : value; } public static String unescapeToken(String aToken) { String value = UNESCAPE.get(aToken); return value == null ? aToken : value; } public static String toText(PennTreeNode aNode) { StringBuilder buf = new StringBuilder(); toText(buf, aNode); return buf.toString(); } private static void toText(StringBuilder aBuffer, PennTreeNode aNode) { if (aNode.isTerminal()) { if (aBuffer.length() > 0) { aBuffer.append(" "); } aBuffer.append(unescapeToken(aNode.getLabel())); } else { for (PennTreeNode n : aNode.getChildren()) { toText(aBuffer, n); } } } public static PennTreeNode selectDfs(PennTreeNode aNode, int aIndex) { return dfs(aIndex, new MutableInt(0), aNode); } private static PennTreeNode dfs(int aTarget, MutableInt aIndex, PennTreeNode aNode) { if (aTarget == aIndex.intValue()) { return aNode; } for (PennTreeNode n : aNode.getChildren()) { aIndex.increment(); PennTreeNode r = dfs(aTarget, aIndex, n); if (r != null) { return r; } } return null; } public static PennTreeNode parsePennTree(String aTree) { StringTokenizer st = new StringTokenizer(aTree, "() ", true); PennTreeNode root = null; Stack<PennTreeNode> stack = new Stack<PennTreeNode>(); boolean seenLabel = false; while (st.hasMoreTokens()) { String t = st.nextToken().trim(); if (t.length() == 0) { // Skip } else if ("(".equals(t)) { PennTreeNode n = new PennTreeNode(); stack.push(n); if (root == null) { root = n; } seenLabel = false; } else if (")".equals(t)) { PennTreeNode n = stack.pop(); if (!stack.isEmpty()) { PennTreeNode p = stack.peek(); p.addChild(n); } } else if (seenLabel) { // If the node has two labels, its a leaf, add a new terminal node then. PennTreeNode p = stack.peek(); PennTreeNode n = new PennTreeNode(); n.setLabel(t); p.addChild(n); } else { PennTreeNode n = stack.peek(); n.setLabel(t); seenLabel = true; } } return root; } public static String toPennTree(PennTreeNode aNode) { StringBuilder sb = new StringBuilder(); toPennTree(sb, aNode, -1); return sb.toString().trim(); } public static String toPrettyPennTree(PennTreeNode aNode) { StringBuilder sb = new StringBuilder(); toPennTree(sb, aNode, 0); return sb.toString().trim(); } private static void toPennTree(StringBuilder aSb, PennTreeNode aNode, int aLevel) { boolean indentationEnabled = aLevel >= 0; // This is a "(Label Token)" if (aNode.isPreTerminal()) { aSb.append('('); aSb.append(aNode.getLabel()); aSb.append(' '); aSb.append(aNode.getChildren().get(0).getLabel()); aSb.append(')'); } else { if (indentationEnabled) { aSb.append(StringUtils.repeat(" ", aLevel * 2)); } aSb.append('('); aSb.append(aNode.getLabel()); PennTreeNode prevChild = null; Iterator<PennTreeNode> i = aNode.getChildren().iterator(); while (i.hasNext()) { PennTreeNode child = i.next(); if (indentationEnabled && !child.isPreTerminal()) { aSb.append('\n'); } else if (indentationEnabled && prevChild != null && !prevChild.isPreTerminal()) { aSb.append('\n'); aSb.append(StringUtils.repeat(" ", (aLevel+1) * 2)); } else { aSb.append(' '); } toPennTree(aSb, child, indentationEnabled ? aLevel + 1 : -1); prevChild = child; } aSb.append(')'); } } /** * Remove trailing or leading whitespace from the annotation. * * @param aText * the text. * @param aSpan * the offsets (will be updated in the process). */ public static void trim(CharSequence aText, int[] aSpan) { int begin = aSpan[0]; int end = aSpan[1]-1; CharSequence data = aText; while ( (begin < (data.length()-1)) && trimChar(data.charAt(begin)) ) { begin ++; } while ( (end > 0) && trimChar(data.charAt(end)) ) { end --; } end++; aSpan[0] = begin; aSpan[1] = end; } public static boolean isEmpty(int aBegin, int aEnd) { return aBegin >= aEnd; } public static boolean trimChar(final char aChar) { switch (aChar) { case '\n': return true; // Line break case '\r': return true; // Carriage return case '\t': return true; // Tab case '\u200E': return true; // LEFT-TO-RIGHT MARK case '\u200F': return true; // RIGHT-TO-LEFT MARK case '\u2028': return true; // LINE SEPARATOR case '\u2029': return true; // PARAGRAPH SEPARATOR default: return Character.isWhitespace(aChar); } } public static List<PennTreeNode> getPreTerminals(PennTreeNode aNode) { List<PennTreeNode> preTerminals = new ArrayList<>(); getPreTerminals(aNode, preTerminals); return preTerminals; } private static void getPreTerminals(PennTreeNode aNode, List<PennTreeNode> aList) { if (aNode.isPreTerminal()) { aList.add(aNode); } else { for (PennTreeNode n : aNode.getChildren()) { getPreTerminals(n, aList); } } } }