/*
* Copyright 2011 chris.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package statalign.postprocess.utils;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.HashSet;
import java.util.Set;
import java.util.Stack;
import statalign.postprocess.plugins.TreeNode;
public class NewickParser {
private static enum TokenType {
SPACE, NAME, NUMBER, PAR_OPEN, PAR_CLOSE, COMMA, TWO_COLON, COLON_COMMA, UNKNOWN, END_OF_STREAM
}
private static class Token {
private TokenType type;
private String text;
public Token(TokenType type, String text) {
this.type = type;
this.text = text;
}
public TokenType getType() {
return type;
}
public String getText() {
return text;
}
@Override
public String toString() {
if (text != null)
return type.toString() + " " + text;
else
return type.toString();
}
}
private enum ParserState {
TREE_START, BRANCH_START, BRANCH_ADD, BRANCH_CLOSE, BRANCH_LEAF, LEAF, TREE_END, EOF
}
private Reader reader;
private StringBuilder sb;
private int bufferedChar;
private int lastRow, lastCol, row, col;
public NewickParser(String string) {
this(new StringReader(string));
//System.out.println(string);
}
public NewickParser(Reader reader) {
this.reader = reader;
sb = new StringBuilder();
bufferedChar = -1;
lastRow = lastCol = row = col = 0;
}
public void close() throws NewickParserException {
try {
reader.close();
}
catch (IOException e) {
throw new NewickParserException(e);
}
}
public TreeNode parse() throws NewickParserException {
Token token = null;
ParserState state = ParserState.TREE_START;
Set<String> names = new HashSet<String>();
TreeNode node = null;
TreeNode root = null;
int level = 0;
Stack<TreeNode> nodeStack = new Stack<TreeNode>();
while (state != ParserState.EOF) {
switch (state) {
case TREE_START:
root = null;
node = null;
level = 0;
nodeStack.clear();
token = nextTokenNoSpace();
switch (token.getType()) {
case PAR_OPEN: state = ParserState.BRANCH_START; break;
case COLON_COMMA: state = ParserState.TREE_END; break;
default: exception("Unexpected token: " + token);
}
break;
case BRANCH_START:
if (root != null)
nodeStack.push(root);
level++;
root = new TreeNode();
node = new TreeNode();
root.addChild(node);
token = nextTokenNoSpace();
switch (token.getType()) {
case PAR_OPEN: state = ParserState.BRANCH_START; break;
case COMMA: state = ParserState.BRANCH_ADD; break;
case PAR_CLOSE: state = ParserState.BRANCH_CLOSE; break;
case NAME:
case NUMBER:
case TWO_COLON: state = ParserState.BRANCH_LEAF; break;
default: exception("Unexpected token: " + token);
}
break;
case BRANCH_ADD:
node = new TreeNode();
root.addChild(node);
token = nextTokenNoSpace();
switch (token.getType()) {
case PAR_OPEN: state = ParserState.BRANCH_START; break;
case COMMA: state = ParserState.BRANCH_ADD; break;
case PAR_CLOSE: state = ParserState.BRANCH_CLOSE; break;
case NAME:
case NUMBER:
case TWO_COLON: state = ParserState.BRANCH_LEAF; break;
default: exception("Unexpected token: " + token);
}
break;
case BRANCH_LEAF:
token = parseNodeName(node, token, names);
switch (token.getType()) {
case COMMA: state = ParserState.BRANCH_ADD; break;
case PAR_CLOSE: state = ParserState.BRANCH_CLOSE; break;
default: exception("Unexpected token: " + token);
}
break;
case BRANCH_CLOSE:
level--;
if (level < 0)
exception("Unexpected branch closing");
token = nextToken();
if (token.getType() == TokenType.NAME
|| token.getType() == TokenType.TWO_COLON)
token = parseNodeName(root, token, names);
if (!nodeStack.isEmpty()) {
node = root;
root = nodeStack.pop();
root.children.set(root.children.size() - 1, node);
}
switch (token.getType()) {
case COMMA: state = ParserState.BRANCH_ADD; break;
case PAR_CLOSE: state = ParserState.BRANCH_CLOSE; break;
case COLON_COMMA: state = ParserState.TREE_END; break;
default: exception("Unexpected token: " + token);
}
break;
case TREE_END:
if (level != 0)
exception("Unexpected end");
// tree.setRoot(root);
token = nextTokenNoSpace();
switch (token.getType()) {
case END_OF_STREAM: state = ParserState.EOF; break;
default: exception("End of stream expected but found " + token);
}
break;
}
}
return root;
}
private Token parseNodeName(TreeNode node, Token token, Set<String> names) throws NewickParserException {
if (token.getType() == TokenType.NAME
|| token.getType() == TokenType.NUMBER) {
if (names.contains(token.getText()))
exception("Node name already defined");
node.name = getDecodedTaxaName(token.getText());
names.add(token.getText());
token = nextToken();
}
if (token.getType() == TokenType.TWO_COLON) {
token = nextToken();
if (token.getType() != TokenType.NUMBER)
exception("Number expected but found " + token);
//node.setValue(Double.parseDouble(token.getText()));
node.edgeLength = Double.parseDouble(token.getText());
token = nextTokenNoSpace();
}
return token;
}
private void exception(String msg) throws NewickParserException {
throw new NewickParserException("Line " + lastRow + " column " + lastCol + " -> " + msg);
}
private Token nextTokenNoSpace() throws NewickParserException {
Token token = nextToken();
while (token.getType() == TokenType.SPACE)
token = nextToken();
return token;
}
private Token nextToken() throws NewickParserException {
sb.setLength(0);
int ch;
char state = 'S';
boolean done = false;
try {
while (!done) {
switch (state) {
case 'S': // Start
ch = nextChar();
if (ch == -1)
return new Token(TokenType.END_OF_STREAM, null);
else if (isWhitespace((char) ch))
return new Token(TokenType.SPACE, "" + (char) ch);
else if (Character.isDigit((char) ch)) {
sb.append((char) ch);
state = 'I';
}
else if (isNameChar(ch)) {
sb.append((char) ch);
state = 'N';
}
else {
switch (ch) {
case '(': return new Token(TokenType.PAR_OPEN, "" + (char) ch);
case ')': return new Token(TokenType.PAR_CLOSE, "" + (char) ch);
case ',': return new Token(TokenType.COMMA, "" + (char) ch);
case ':': return new Token(TokenType.TWO_COLON, "" + (char) ch);
case ';': return new Token(TokenType.COLON_COMMA, "" + (char) ch);
default: exception("Unexpected character: " + (char) ch);
}
}
break;
case 'I': // Integer number
ch = nextChar();
while (ch != -1 && Character.isDigit((char) ch)) {
sb.append((char) ch);
ch = nextChar();
}
if (ch == '.') {
sb.append((char) ch);
state = 'F';
}
else if (isNameChar(ch)) {
sb.append((char) ch);
state = 'N';
}
else {
bufferedChar = ch;
return new Token(TokenType.NUMBER, sb.toString());
}
break;
case 'F': // Floating number
ch = nextChar();
while (ch != -1 && Character.isDigit((char) ch)) {
sb.append((char) ch);
ch = nextChar();
}
if (isNameChar(ch)) {
sb.append((char) ch);
state = 'N';
}
else {
bufferedChar = ch;
return new Token(TokenType.NUMBER, sb.toString());
}
break;
case 'N': // Name
ch = nextChar();
while (ch != -1 && isNameChar((char) ch)) {
sb.append((char) ch);
ch = nextChar();
}
bufferedChar = ch;
return new Token(TokenType.NAME, sb.toString());
default:
throw new NewickParserException("Internal error: Unknown tokenizer state: " + state);
}
}
}
catch (Exception e) {
throw new NewickParserException(e);
}
throw new NewickParserException("Unexpected tokenizer end");
}
private int nextChar() throws IOException {
lastRow = row;
lastCol = col;
if (bufferedChar != -1) {
int bufChar = bufferedChar;
bufferedChar = -1;
return bufChar;
}
int ch = reader.read();
if (ch == '\n') {
row++;
col = 0;
}
else
col++;
return ch;
}
private boolean isWhitespace(char ch) {
return Character.isWhitespace(ch);
}
private boolean isSpecialChar(int ch) {
return ch == '(' || ch == ')' || ch == ',' || ch == ':' || ch == ';';
}
private boolean isNameChar(int ch) {
return ch != -1 && !isSpecialChar(ch) && !isWhitespace((char) ch);
}
/**
* A utility function to encode taxa names so they do not contain any of the Newick formats
* special characters, as specified in {@link #isSpecialChar(int)}. It also removes the
* speces from the names. TODO: It does however not deal with parenthesis at the moment.
* @param taxa the taxa name to be encoded.
* @return the encoded taxa name.
*/
public static String getEncodedTaxaName(String taxa) {
String s = null;
try {
s = URLEncoder.encode(taxa.replaceAll(" ", ""), "UTF-8");
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
return s;
}
/**
* A utility function to decode taxa names that have been encoded by
* {@link #getEncodedTaxaName(String)}. It does not reverse the spaces however.
* TODO: make it deal with parenthesis.
* @param encodedTaxa
*/
public static String getDecodedTaxaName(String encodedTaxa) {
String s = null;
try {
s = URLDecoder.decode(encodedTaxa, "UTF-8");
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
return s;
}
}