/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* TreeFileParser.java
* Copyright Remco Bouckaert remco@cs.auckland.ac.nz (C) 2011
*/
package viz;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.Vector;
public class TreeFileParser {
/**
* default tree branch length, used when that info is not in the Newick tree
**/
final static float DEFAULT_LENGTH = 0.001f;
/** amount subtracted from numeric taxon labels in getLabelIndex(); parseFile() resets it to 0 and
* sets it to 1 when a label number in the translate block exceeds the number of labels stored so far **/
int m_nOffset = 0;
/** labels of leaves **/
Vector<String> m_sLabels;
/** position information for the leaves (if available); filled by parseFile() from labels of
* the form name(latitude x longitude) found in the translate block **/
Vector<Float> m_fLongitude;
Vector<Float> m_fLatitude;
/** extreme values for position information **/
float m_fMaxLong, m_fMaxLat, m_fMinLong, m_fMinLat;
/** nr of labels in dataset **/
int m_nNrOfLabels;
/** burn in = nr of trees ignored at the start of tree file, can be set by command line option **/
int m_nBurnIn = 0;
/** if true, parseFile() interprets m_nBurnIn as a percentage of the file size,
* otherwise as a count that is decremented while reading **/
boolean m_bBurnInIsPercentage = true;
//DensiTree m_densiTree;
/** for memory saving, set to true. NOTE(review): in this file the flag is only referenced
* from commented-out code in parseFile() **/
boolean m_bSurpressMetadata = true;
/** true if there is no translate block, so getLabelIndex() looks taxa up by name instead of parsing them
* as numbers. This solves issues where the taxa labels are numbers e.g. in generated tree data **/
boolean m_bIsLabelledNewick = false;
/** flag to indicate that single child nodes are allowed **/
boolean m_bAllowSingleChild = false;
/**
 * Creates a parser configured from a DensiTree instance.
 * The label and position vectors are shared with densiTree (same objects, not copies);
 * the burn-in and single-child settings are copied.
 *
 * @param densiTree source of the shared vectors and of the parser settings
 */
public TreeFileParser(DensiTree densiTree) {
    // position extremes start out inverted, so the first real coordinate replaces them
    this.m_fMaxLat = -90;
    this.m_fMaxLong = -180;
    this.m_fMinLat = 90;
    this.m_fMinLong = 180;

    // parser settings
    this.m_bAllowSingleChild = densiTree.m_bAllowSingleChild;
    this.m_bBurnInIsPercentage = densiTree.m_bBurnInIsPercentage;
    this.m_nBurnIn = densiTree.m_nBurnIn;

    // shared collections
    this.m_fLatitude = densiTree.m_fLatitude;
    this.m_fLongitude = densiTree.m_fLongitude;
    this.m_sLabels = densiTree.m_sLabels;
} // c'tor
/**
 * Creates a parser from explicitly supplied label and position vectors.
 * The vectors are stored by reference. When labels are supplied, the parser
 * starts out in labelled-Newick mode with the label count taken from the vector.
 *
 * @param sLabels    taxon labels, may be null
 * @param fLongitude longitudes of the taxa
 * @param fLatitude  latitudes of the taxa
 * @param nBurnIn    burn-in value
 */
public TreeFileParser(Vector<String> sLabels, Vector<Float> fLongitude, Vector<Float> fLatitude, int nBurnIn) {
    // position extremes start out inverted, so the first real coordinate replaces them
    this.m_fMaxLat = -90;
    this.m_fMaxLong = -180;
    this.m_fMinLat = 90;
    this.m_fMinLong = 180;

    this.m_nBurnIn = nBurnIn;
    this.m_fLatitude = fLatitude;
    this.m_fLongitude = fLongitude;
    this.m_sLabels = sLabels;
    if (sLabels == null) {
        // without labels the defaults (no labelled Newick, zero labels) stay in place
        return;
    }
    this.m_bIsLabelledNewick = true;
    this.m_nNrOfLabels = sLabels.size();
}
/**
 * Reads all trees from a tree file.
 * <p>
 * If a line containing "translate" is found the file is treated as a Nexus file: the
 * translate block is read (collecting labels and, for labels of the form
 * name(latitude x longitude), position information) and every following line starting
 * with "tree " is parsed. Otherwise the file is treated as a plain list of Newick trees,
 * one per line, and the labels are collected from the first tree line.
 * <p>
 * In a Nexus file, trees inside the burn-in are skipped. If the burn-in turns out to be
 * larger than the file, the burn-in is reset and the file is parsed again.
 *
 * @param sFile name of the file to read
 * @return the parsed trees; when no tree was read the array has length 1 and holds null
 * @throws Exception if the file is empty, cannot be read, or contains a malformed entry
 */
public Node [] parseFile(String sFile) throws Exception {
    Vector<Node> trees = new Vector<Node>();
    m_nOffset = 0;
    File file = new File(sFile);
    long nFileSize = file.length();
    BufferedReader fin = new BufferedReader(new FileReader(sFile));
    try {
        String sStr = fin.readLine();
        if (sStr == null) {
            throw new Exception("File " + sFile + " is empty");
        }
        nFileSize -= sStr.length();
        // look for the translate block
        while (fin.ready() && sStr.toLowerCase().indexOf("translate") < 0) {
            sStr = fin.readLine();
            nFileSize -= sStr.length();
        }
        m_bIsLabelledNewick = false;
        m_nNrOfLabels = m_sLabels.size();
        boolean bAddLabels = (m_nNrOfLabels == 0);
        if (sStr.toLowerCase().indexOf("translate") < 0) {
            m_bIsLabelledNewick = true;
            // could not find translate block, assume it is a list of Newick trees instead of Nexus file
            fin.close();
            fin = new BufferedReader(new FileReader(sFile));
            // long arithmetic: the percentage of a very large file does not fit in an int
            long nBurnIn = m_nBurnIn;
            if (m_bBurnInIsPercentage) {
                nBurnIn = m_nBurnIn * file.length() / 100;
            }
            // read lines until the labels are known
            // NOTE(review): burn-in is only applied in this first loop; once labels are known
            // every remaining tree is kept. Behaviour preserved -- confirm whether this is intended.
            while (fin.ready() && m_nNrOfLabels == 0) {
                sStr = fin.readLine();
                if (m_bBurnInIsPercentage) {
                    nBurnIn -= sStr.length();
                } else {
                    nBurnIn--;
                }
                if (sStr.length() > 2 && sStr.indexOf("(") >= 0) {
                    if (bAddLabels) {
                        m_nNrOfLabels = 0;
                        for (String sLabel : extractNewickLabels(sStr)) {
                            if (sLabel.length() > 0) {
                                m_sLabels.add(sLabel);
                                m_nNrOfLabels++;
                            }
                        }
                    }
                    if (nBurnIn < 0) {
                        addTree(trees, sStr);
                    }
                }
            }
            while (fin.ready()) {
                sStr = fin.readLine();
                if (sStr.length() > 2 && sStr.indexOf("(") >= 0) {
                    addTree(trees, sStr);
                    reportProgress(trees.size());
                }
            }
        } else {
            // read the translate block, one "<number> <label>" entry per line
            sStr = fin.readLine();
            if (sStr == null) {
                // file ends right after the translate keyword
                sStr = "";
            }
            nFileSize -= sStr.length();
            boolean bLastLabel = false;
            while (fin.ready() && !bLastLabel) {
                if (sStr.indexOf(";") >= 0) {
                    sStr = sStr.replace(';',' ').trim();
                    if (sStr.isEmpty()) {
                        // the terminating semicolon was on a line of its own
                        break;
                    }
                    bLastLabel = true;
                }
                sStr = sStr.replaceAll(",", "");
                sStr = sStr.replaceAll("^\\s+", "");
                // blank lines inside the translate block are skipped
                if (!sStr.isEmpty()) {
                    String[] sStrs = sStr.split("\\s+");
                    if (sStrs.length < 2) {
                        throw new Exception("Malformed translate entry '" + sStr + "' in " + sFile);
                    }
                    int iLabel = Integer.parseInt(sStrs[0]);
                    String sLabel = sStrs[1];
                    if (m_sLabels.size() < iLabel) {
                        // label numbers run ahead of the label count: numbering starts at 1
                        m_nOffset = 1;
                    }
                    // check if there is geographic info in the name
                    if (sLabel.contains("(")) {
                        int iStr = sLabel.indexOf('(');
                        int iStr2 = sLabel.indexOf('x', iStr);
                        if (iStr2 >= 0) {
                            int iStr3 = sLabel.indexOf(')', iStr2);
                            if (iStr3 >= 0) {
                                float fLat = Float.parseFloat(sLabel.substring(iStr+1, iStr2));
                                float fLong = Float.parseFloat(sLabel.substring(iStr2+1, iStr3));
                                if (fLat!=0 || fLong!=0) {
                                    m_fMinLat = Math.min(m_fMinLat, fLat);
                                    m_fMaxLat = Math.max(m_fMaxLat, fLat);
                                    m_fMinLong = Math.min(m_fMinLong, fLong);
                                    m_fMaxLong = Math.max(m_fMaxLong, fLong);
                                }
                                // pad with zeros for earlier labels that had no position
                                while (m_fLatitude.size() < m_sLabels.size()) {
                                    m_fLatitude.add(0f);
                                    m_fLongitude.add(0f);
                                }
                                m_fLatitude.add(fLat);
                                m_fLongitude.add(fLong);
                            }
                        }
                        sLabel = sLabel.substring(0, sLabel.indexOf("("));
                    }
                    if (bAddLabels) {
                        m_sLabels.add(sLabel);
                        m_nNrOfLabels++;
                    }
                }
                if (!bLastLabel) {
                    sStr = fin.readLine();
                    if (sStr == null) {
                        // file ended inside the translate block
                        break;
                    }
                    nFileSize -= sStr.length();
                }
            }
            // read trees
            long nBurnIn = m_nBurnIn;
            if (m_bBurnInIsPercentage) {
                nBurnIn = m_nBurnIn * nFileSize / 100;
            }
            while (fin.ready()) {
                sStr = fin.readLine();
                if (m_bBurnInIsPercentage) {
                    nBurnIn -= sStr.length();
                }
                sStr = sStr.trim();
                if (sStr.length() > 5 && sStr.substring(0,5).toLowerCase().startsWith("tree ")) {
                    if (nBurnIn <= 0) {
                        int i = sStr.indexOf('(');
                        if (i > 0) {
                            sStr = sStr.substring(i);
                        }
                        addTree(trees, sStr);
                        reportProgress(trees.size());
                    } else if (!m_bBurnInIsPercentage) {
                        nBurnIn--;
                    }
                }
            }
            fin.close();
            if (nBurnIn > 0) {
                System.err.println("WARNING: Burn-in too large, resetting burn-in to default");
                m_sLabels.clear();
                // positions are read again together with the labels, so drop the old ones
                if (m_fLatitude != null) {
                    m_fLatitude.clear();
                }
                if (m_fLongitude != null) {
                    m_fLongitude.clear();
                }
                if (m_bBurnInIsPercentage) {
                    // if the default of 10% (or less) was already too large, fall back to no
                    // burn-in; otherwise the reparse would recurse without end
                    m_nBurnIn = (m_nBurnIn > 10 ? 10 : 0);
                } else {
                    m_nBurnIn = 0;
                }
                return parseFile(sFile);
            }
        }
        System.err.println();
        System.err.println("Geo: " +m_fMinLong + "x" + m_fMinLat + " " + m_fMaxLong + "x" + m_fMaxLat);
        return trees.toArray(new Node[1]);
    } finally {
        // closing an already closed reader has no effect
        fin.close();
    }
} // parseFile

/** Parses one Newick string, sorts the tree, labels its internal nodes and appends it to trees. */
private void addTree(Vector<Node> trees, String sNewick) throws Exception {
    Node tree = parseNewick(sNewick);
    if (tree == null) {
        throw new Exception("No Newick tree found in: " + sNewick);
    }
    tree.sort();
    tree.labelInternalNodes(m_nNrOfLabels);
    trees.add(tree);
}

/** Prints the number of trees read so far: every 100 trees for 100 or more labels, else every 1000 trees. */
private void reportProgress(int nTrees) {
    if (nTrees % 100 == 0 && (m_nNrOfLabels >= 100 || nTrees % 1000 == 0)) {
        System.err.print(nTrees + " ");
    }
}

/** Extracts the label candidates from a Newick line; the result may contain empty strings. */
private static String [] extractNewickLabels(String sNewick) {
    String sStr = sNewick.substring(sNewick.indexOf("("));
    // strip [meta data]; an unterminated bracket drops the rest of the line
    int i0 = sStr.indexOf('[');
    while (i0 >= 0) {
        int i1 = sStr.indexOf(']', i0);
        if (i1 < 0) {
            sStr = sStr.substring(0, i0);
        } else {
            sStr = sStr.substring(0, i0) + sStr.substring(i1 + 1);
        }
        i0 = sStr.indexOf('[');
    }
    sStr = sStr.replaceAll("[;\\(\\),]"," ");
    sStr = sStr.replaceAll(":\\s*[0-9\\.Ee-]+"," ");
    return sStr.split("\\s+");
}
// /**
// * helper method for parsing Newick tree. It finds the split point of the
// * tree represented by sStr
// **/
// int nextNode(String sStr, int i) {
// int nBraces = 0;
// char c = sStr.charAt(i);
// do {
// i++;
// if (i < sStr.length()) {
// c = sStr.charAt(i);
// switch (c) {
// case '(':
// nBraces++;
// break;
// case ')':
// nBraces--;
// break;
// default:
// break;
// }
// }
// } while (i < sStr.length() && (nBraces > 0 || (c != ',' && c != ')' && c != '(')));
// if (i >= sStr.length() || nBraces < 0) {
// return -1;
// } else if (sStr.charAt(i) == ')') {
// i++;
// if (sStr.charAt(i) == ':') {
// i++;
// c = sStr.charAt(i);
// while (i < sStr.length()
// && (c == '.' || c == '+' || c == '-' || Character.isDigit(c) || ((c == 'e' || c == 'E') && (sStr.charAt(i + 1) == '+' || sStr
// .charAt(i + 1) == '-')))) {
// i++;
// if (i < sStr.length()) {
// c = sStr.charAt(i);
// }
// }
// }
// }
// return i;
// } // nextNode
//
// /**
// * convert string containing Newick tree into tree data structure but only
// * in the cleaned up format (no meta data allowed)
// *
// * @param sStr
// * @return tree consisting of a Node
// */
// Node parseNewick2(String sStr) throws Exception {
// if (sStr == null || sStr.length() == 0) {
// return null;
// }
// Node node = m_densiTree.new Node();
// if (sStr.startsWith("(")) {
// int i1 = nextNode(sStr, 0);
// int i2 = nextNode(sStr, i1);
// //node.m_children = new Node[2];
// node.m_left = parseNewick(sStr.substring(1, i1));
// node.m_left.m_Parent = node;
// String sStr2 = sStr.substring(i1 + 1, (i2 > 0 ? i2 : sStr.length()));
// node.m_right = parseNewick(sStr2);
// node.m_right.m_Parent = node;
// Node node2 = null;
// if (i2 > 0 && i2 < sStr.length()-1 && sStr.charAt(i2+1)!=':'&& sStr.charAt(i2+1)!=';') {
// // looks like it is a triple split, so we need another binaray node to represent this
// int i3 = nextNode(sStr, i2);
// node2 = m_densiTree.new Node();
// String sStr3 = sStr.substring(i2 + 1, (i3 > 0 ? i3 : sStr.length()));
// node2.m_left = parseNewick(sStr3);
// node2.m_left.m_Parent = node2;
// node2.m_right = node;
// node2.m_fLength = 0;
// }
//
// if (sStr.lastIndexOf('[') > sStr.lastIndexOf(')')) {
// sStr = sStr.substring(sStr.lastIndexOf('['));
// i2 = sStr.indexOf(']');
// if (i2 < 0) {
// throw new Exception("unbalanced square bracket found:" + sStr);
// }
// sStr2 = sStr.substring(1, i2);
// node.m_sMetaData = sStr2;
// }
// if (sStr.lastIndexOf(':') > sStr.lastIndexOf(')')) {
// sStr = sStr.substring(sStr.lastIndexOf(':'));
// sStr = sStr.replaceAll("[,\\):;]", "");
// try {
// node.m_fLength = Float.parseFloat(sStr);
// } catch (Exception e) {
// node.m_fLength = (float) DEFAULT_LENGTH;
// }
// } else {
// node.m_fLength = (float) DEFAULT_LENGTH;
// }
//
// if (node2 != null) {
// node.m_Parent = node2;
// float h1 = height(node.m_left);
// float h2 = height(node.m_right);
// float h3 = height(node2.m_left);
// if (h1 > h2 && h1 > h3) {
// Node n3 = node2.m_left;
// node2.m_left = node.m_left;
// node2.m_left.m_Parent = node2;
// node.m_left = n3;
// node.m_left.m_Parent = node;
// node.m_fLength = (h1-(h1+h2+h3)/3.0f)/2.0f;
// node2.m_left.m_fLength -= node.m_fLength;
// } else if (h3 > h1 && h3 > h2) {
// node.m_fLength = (h3-(h1+h2+h3)/3.0f)/2.0f;
// node2.m_left.m_fLength -= node.m_fLength;
// node = node2;
// } else {
// // h2 is largest of the lot
// Node n3 = node2.m_left;
// node2.m_left = node.m_right;
// node2.m_left.m_Parent = node2;
// node.m_right = n3;
// node.m_right.m_Parent = node;
// node.m_fLength = (h2-(h1+h2+h3)/3.0f)/2.0f;
// node2.m_left.m_fLength -= node.m_fLength;
// }
// node = node2;
// }
// } else {
// // it is a leaf
// if (sStr.contains("[")) {
// // grab metadata
// int i1 = sStr.indexOf('[');
// int i2 = sStr.indexOf(']');
// if (i2 < 0) {
// throw new Exception("unbalanced square bracket found:" + sStr);
// }
// String sStr2 = sStr.substring(i1 + 1, i2);
// sStr = sStr.substring(0, i1) + sStr.substring(i2 + 1);
// node.m_sMetaData = sStr2;
// }
// if (sStr.indexOf(')') >= 0) {
// sStr = sStr.substring(0, sStr.indexOf(')'));
// }
// sStr = sStr.replaceFirst("[,\\);]", "");
// if (sStr.length() > 0) {
// if (sStr.indexOf(':') >= 0) {
// int iColon = sStr.indexOf(':');
// node.m_iLabel = getLabelIndex(sStr.substring(0, iColon));
// if (sStr.indexOf(':', iColon + 1) >= 0) {
// int iColon2 = sStr.indexOf(':', iColon + 1);
// node.m_fLength = Float.parseFloat(sStr.substring(iColon + 1, iColon2));
// } else {
// node.m_fLength = Float.parseFloat(sStr.substring(iColon + 1));
// }
// } else {
// node.m_iLabel = getLabelIndex(sStr);
// node.m_fLength = 1;
// }
// } else {
// return null;
// }
// }
// return node;
// } // parseNewick
/**
 * Maps a taxon token from a Newick string onto an index in the label list.
 * Unless the trees are labelled Newick, the token is first read as a number
 * (minus m_nOffset). If that does not work, the token is looked up in the list
 * of labels: as is, then against labels with their surrounding quotes removed,
 * and finally with the token's own surrounding quotes removed.
 *
 * @param sStr taxon token as it appears in the tree
 * @return index of the taxon
 * @throws Exception if the token cannot be matched to any label
 */
private int getLabelIndex(String sStr) throws Exception {
    if (!m_bIsLabelledNewick) {
        try {
            return Integer.parseInt(sStr) - m_nOffset;
        } catch (NumberFormatException ignored) {
            // not a number: fall through and look the token up by name
        }
    }
    for (int i = 0; i < m_nNrOfLabels; i++) {
        if (sStr.equals(m_sLabels.elementAt(i))) {
            return i;
        }
    }
    // sStr may have (double) quotes missing
    for (int i = 0; i < m_nNrOfLabels; i++) {
        String sLabel = m_sLabels.elementAt(i);
        if (isQuoted(sLabel) && sStr.equals(sLabel.substring(1, sLabel.length()-1))) {
            return i;
        }
    }
    // sStr may have extra (double) quotes
    if (isQuoted(sStr)) {
        return getLabelIndex(sStr.substring(1, sStr.length()-1));
    }
    throw new Exception("Label '" + sStr + "' in Newick tree could not be identified");
}

/**
 * Tells whether s is wrapped in a pair of single or double quotes. A lone quote
 * character does not count: it starts and ends with a quote, but has no content to unwrap.
 */
private static boolean isQuoted(String s) {
    if (s.length() < 2) {
        return false;
    }
    return s.startsWith("'") && s.endsWith("'")
        || s.startsWith("\"") && s.endsWith("\"");
}
/**
 * Calculates the height of the subtree below (and including the branch of) a node:
 * the node's own branch length plus the height of its tallest child subtree.
 *
 * @param node root of the subtree
 * @return branch length of node for a leaf, otherwise branch length plus the larger child height
 */
float height(Node node) {
    if (node.isLeaf()) {
        return node.m_fLength;
    }
    final float fLeftHeight = height(node.m_left);
    final float fRightHeight = height(node.m_right);
    return node.m_fLength + Math.max(fLeftHeight, fRightHeight);
}
/** characters of the Newick string being tokenized; set by parseNewick() **/
char [] m_chars;
/** index in m_chars of the first character of the current token **/
int m_iTokenStart;
/** index in m_chars just past the last character of the current token **/
int m_iTokenEnd;
/** token types returned by nextToken() **/
final static int COMMA = 1;
final static int BRACE_OPEN = 3;
final static int BRACE_CLOSE = 4;
final static int COLON = 5;
final static int SEMI_COLON = 8;
/** text between square brackets, brackets included **/
final static int META_DATA = 6;
/** run of characters not containing a blank, tab or one of ( ) [ : , ; **/
final static int TEXT = 7;
/** returned when there is no further token **/
final static int UNKNOWN = 0;
/**
 * Advances to the next token in m_chars, starting at m_iTokenEnd.
 * Leading blanks and tabs are skipped. Afterwards m_iTokenStart is the index of the
 * first character of the token and m_iTokenEnd the index just past its last character.
 *
 * @return the token type (one of the constants COMMA, BRACE_OPEN, BRACE_CLOSE, COLON,
 *         SEMI_COLON, META_DATA, TEXT), or UNKNOWN when no characters are left. This
 *         includes the case where only blanks and tabs remain, which previously ran
 *         past the end of the array.
 */
int nextToken() {
    // skip spaces
    while (m_iTokenEnd < m_chars.length && (m_chars[m_iTokenEnd] == ' ' || m_chars[m_iTokenEnd] == '\t')) {
        m_iTokenEnd++;
    }
    m_iTokenStart = m_iTokenEnd;
    if (m_iTokenEnd >= m_chars.length) {
        return UNKNOWN;
    }
    switch (m_chars[m_iTokenEnd]) {
    case '(':
        m_iTokenEnd++;
        return BRACE_OPEN;
    case ':':
        m_iTokenEnd++;
        return COLON;
    case ';':
        m_iTokenEnd++;
        return SEMI_COLON;
    case ')':
        m_iTokenEnd++;
        return BRACE_CLOSE;
    case ',':
        m_iTokenEnd++;
        return COMMA;
    case '[':
        // the token runs up to and including the first closing bracket,
        // or to the end of the input if there is none
        m_iTokenEnd++;
        while (m_iTokenEnd < m_chars.length && m_chars[m_iTokenEnd-1] != ']') {
            m_iTokenEnd++;
        }
        return META_DATA;
    default:
        // text runs until the next blank, tab or structural character
        while (m_iTokenEnd < m_chars.length && " \t()[:,;".indexOf(m_chars[m_iTokenEnd]) < 0) {
            m_iTokenEnd++;
        }
        return TEXT;
    }
}
/**
 * Converts a string containing a Newick tree into a tree data structure.
 * Parsing starts at the first opening brace and stops at the first semicolon
 * or at the end of the string. Nodes with more than two children are converted
 * to binary form by inserting dummy parents with DEFAULT_LENGTH branches.
 * Text following a colon is read as branch length, other text as taxon label.
 *
 * @param sStr the Newick string; text before the first '(' is ignored
 * @return root node of the tree, or null if sStr is null, empty or contains no '('
 * @throws Exception if the string cannot be parsed; the message shows the text
 *         around the offending token and the original exception is attached as cause
 */
public Node parseNewick(String sStr) throws Exception {
    try {
        if (sStr == null || sStr.length() == 0) {
            return null;
        }
        // ignore trailing blanks and tabs, so the tokenizer never runs past the last token;
        // indices into m_chars remain valid indices into sStr
        int nEnd = sStr.length();
        while (nEnd > 0 && (sStr.charAt(nEnd - 1) == ' ' || sStr.charAt(nEnd - 1) == '\t')) {
            nEnd--;
        }
        m_chars = sStr.substring(0, nEnd).toCharArray();
        m_iTokenStart = sStr.indexOf('(');
        if (m_iTokenStart < 0) {
            return null;
        }
        m_iTokenEnd = m_iTokenStart;
        // stack of nodes under construction; isFirstChild tells for each stack entry
        // whether it was opened by a brace (true) or by a comma (false)
        Vector<Node> stack = new Vector<Node>();
        Vector<Boolean> isFirstChild = new Vector<Boolean>();
        stack.add(new Node());
        isFirstChild.add(true);
        stack.lastElement().m_fLength = DEFAULT_LENGTH;
        // text is a label until a colon is seen, then it is a branch length
        boolean bIsLabel = true;
        while (m_iTokenEnd < m_chars.length) {
            switch (nextToken()) {
            case BRACE_OPEN:
            {
                Node node2 = new Node();
                node2.m_fLength = DEFAULT_LENGTH;
                stack.add(node2);
                isFirstChild.add(true);
                bIsLabel = true;
            }
                break;
            case BRACE_CLOSE:
            {
                if (isFirstChild.lastElement()) {
                    if (m_bAllowSingleChild) {
                        // process single child nodes: the child becomes the left child of its parent
                        Node left = stack.lastElement();
                        stack.remove(stack.size()-1);
                        isFirstChild.remove(isFirstChild.size()-1);
                        Node parent = stack.lastElement();
                        parent.m_left = left;
                        left.m_Parent = parent;
                        break;
                    } else {
                        // don't know how to process single child nodes
                        throw new Exception("Node with single child found.");
                    }
                }
                // process multi(i.e. more than 2)-child nodes by pairwise merging.
                while (!isFirstChild.elementAt(isFirstChild.size()-2)) {
                    Node right = stack.lastElement();
                    stack.remove(stack.size()-1);
                    isFirstChild.remove(isFirstChild.size()-1);
                    Node left = stack.lastElement();
                    stack.remove(stack.size()-1);
                    isFirstChild.remove(isFirstChild.size()-1);
                    Node dummyparent = new Node();
                    dummyparent.m_fLength = DEFAULT_LENGTH;
                    dummyparent.m_left = left;
                    left.m_Parent = dummyparent;
                    dummyparent.m_right = right;
                    right.m_Parent = dummyparent;
                    stack.add(dummyparent);
                    isFirstChild.add(false);
                }
                // last two nodes on stack merged into single parent node
                Node right = stack.lastElement();
                stack.remove(stack.size()-1);
                isFirstChild.remove(isFirstChild.size()-1);
                Node left = stack.lastElement();
                stack.remove(stack.size()-1);
                isFirstChild.remove(isFirstChild.size()-1);
                Node parent = stack.lastElement();
                parent.m_left = left;
                left.m_Parent = parent;
                parent.m_right = right;
                right.m_Parent = parent;
            }
                break;
            case COMMA:
            {
                Node node2 = new Node();
                node2.m_fLength = DEFAULT_LENGTH;
                stack.add(node2);
                isFirstChild.add(false);
                bIsLabel = true;
            }
                break;
            case COLON:
                bIsLabel = false;
                break;
            case TEXT:
            {
                String sText = sStr.substring(m_iTokenStart, m_iTokenEnd);
                if (bIsLabel) {
                    stack.lastElement().m_iLabel = getLabelIndex(sText);
                } else {
                    stack.lastElement().m_fLength = Float.parseFloat(sText);
                }
            }
                break;
            case META_DATA:
            {
                // strip the brackets; several meta data blocks on one node are joined by commas
                String sMetaData = sStr.substring(m_iTokenStart+1, m_iTokenEnd-1);
                if (stack.lastElement().getMetaData() == null) {
                    stack.lastElement().setMetaData(sMetaData);
                } else {
                    stack.lastElement().setMetaData(stack.lastElement().getMetaData() + ("," + sMetaData));
                }
            }
                break;
            case SEMI_COLON:
                return stack.lastElement();
            default:
                throw new Exception("parseNewick: unknown token");
            }
        }
        return stack.lastElement();
    } catch (Exception e) {
        // show up to 100 characters before the offending token, and the token itself
        String sMessage = e.getMessage() + ": " + sStr.substring(Math.max(0, m_iTokenStart-100), m_iTokenStart) + " >>>" + sStr.substring(m_iTokenStart, m_iTokenEnd) + " <<< ...";
        System.err.println(sMessage);
        throw new Exception(sMessage, e);
    }
}
} // class TreeFileParser