/** * Copyright (c) 2007, Regents of the University of Colorado All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. Redistributions in binary * form must reproduce the above copyright notice, this list of conditions and * the following disclaimer in the documentation and/or other materials provided * with the distribution. Neither the name of the University of Colorado at * Boulder nor the names of its contributors may be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ package clear.treebank; import clear.util.JFileTokenizer; import com.carrotsearch.hppc.IntObjectOpenHashMap; import java.util.ArrayList; import java.util.Arrays; /** * Treebank reader * * @author Jinho D. Choi <b>Last update:</b> 8/30/2010 */ public class TBReader { /** * Left round bracket "(" */ static final public String LRB = "("; /** * Right round bracket "(" */ static final public String RRB = ")"; /** * FileTokenizer to read the Treebank file */ private JFileTokenizer f_tree; /** * Initializes the Treebank reader. * * @param treeFile name of the Treebank file */ public TBReader(String treeFile) { String delim = LRB + RRB + JFileTokenizer.WHITE; f_tree = new JFileTokenizer(treeFile, delim, true); } /** * Returns the next tree in the Treebank. If there is none, returns null. */ public TBTree nextTree() { String str; do // find the first '(' { str = nextToken(); if (str == null) { f_tree.close(); return null; } } while (!str.equals(LRB)); int numBracket = 1; int terminalIndex = 0; int tokenIndex = 0; TBTree tree = new TBTree(); TBNode head = new TBNode(null, TBEnLib.POS_TOP); // dummy-head TBNode curr = head; // pointer to the current node while (true) { if ((str = nextToken()) == null) { errorMsg("more token needed"); } if (numBracket == 1 && str.equals(TBEnLib.POS_TOP)) { TBNode node = new TBNode(curr, str); // add a child to 'curr' curr.addChild(node); curr = node; // move to child } else if (str.equals(LRB)) { numBracket++; if ((str = nextToken()) == null) // str = pos-tag { errorMsg("POS-tag is missing"); } TBNode node = new TBNode(curr, str); // add a child to 'curr' curr.addChild(node); curr = node; // move to child } else if (str.equals(RRB)) { numBracket--; curr = curr.getParent(); // move to parent if (numBracket == 0) { break; } } else { curr.setForm(str); // str = word curr.terminalId = curr.headId = terminalIndex++; if (!curr.isEmptyCategory()) { curr.tokenId = tokenIndex++; } tree.addTerminalNode(curr); // add 'curr' as a leaf } } TBNode top = head.getChildren().get(0); // normalizeEC(top); if (top.isPos(TBEnLib.POS_TOP)) { top.setParent(null); tree.setRootNode(top); } else { tree.setRootNode(head); } return tree; } public void normalizeEC(TBNode root) { IntObjectOpenHashMap<ArrayList<TBNode>> map = new IntObjectOpenHashMap<>(); retrieveCoIndexMap(root, map); if (map.isEmpty()) { return; } int[] keys = map.keys().toArray(); ArrayList<TBNode> list; Arrays.sort(keys); int coIndex = 1, i; TBNode curr, ec; boolean isFirst; for (int key : keys) { list = map.get(key); isFirst = true; for (i = list.size() - 1; i >= 0; i--) { curr = list.get(i); if (curr.isEmptyCategoryRec() && i > 0) { ec = curr.getSubTerminalNodes().get(0); if (isFirst || ec.isForm("\\*ICH\\*|\\*RNR\\*")) { curr.coIndex = -1; ec.form += "-" + coIndex; isFirst = false; } else { curr.coIndex = coIndex++; ec.form += "-" + coIndex; } } else { curr.coIndex = coIndex; } } coIndex++; } } private void retrieveCoIndexMap(TBNode node, IntObjectOpenHashMap<ArrayList<TBNode>> map) { if (node.isPhrase()) { if (node.coIndex != -1) { int key = node.coIndex; ArrayList<TBNode> list; if (map.containsKey(key)) { list = map.get(key); } else { list = new ArrayList<>(); map.put(key, list); } list.add(node); } for (TBNode child : node.getChildren()) { retrieveCoIndexMap(child, map); } } } /** * Skips all white-spaces and returns the next token. If there is no such * token, returns null. */ private String nextToken() { while (f_tree.hasMoreTokens()) { String str = f_tree.nextToken(); if (JFileTokenizer.WHITE.indexOf(str) == -1) { return str; } } return null; } /** * Prints an error-message and exits. */ private void errorMsg(String msg) { System.err.println("error: " + msg + " (line: " + f_tree.getLineNumber() + ")"); System.exit(1); } }