/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* TreeFileParser.java
* Copyright Remco Bouckaert remco@cs.auckland.ac.nz (C) 2011
*/
package beast.app.treeannotator;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Vector;
import beast.core.util.Log;
import beast.evolution.tree.Node;
import beast.evolution.tree.Tree;
public class TreeSetParser {
/**
* default tree branch length, used when that info is not in the Newick tree
**/
final static float DEFAULT_LENGTH = 0.001f;
/** value subtracted from numeric taxon labels when they are mapped to label indices;
* parseFile resets it to 0 and sets it to 1 when a translate block uses an index
* larger than the number of labels known so far **/
int m_nOffset = 0;
/** labels of leaves **/
List<String> m_sLabels;
/** position information for the leaves (if available) **/
List<Float> m_fLongitude;
List<Float> m_fLatitude;
/** extreme values for position information **/
float m_fMaxLong, m_fMaxLat, m_fMinLong, m_fMinLat;
/** nr of labels in dataset **/
int m_nNrOfLabels;
/** burn in = percentage of trees ignored at the start of the tree file, can be set by command line option **/
int m_nBurnInPercentage = 0;
//DensiTree m_densiTree;
/** for memory saving, set to true.
* NOTE(review): nothing in this file reads this flag -- confirm whether it is still needed **/
boolean m_bSurpressMetadata = true;
/** true if there is no translate block, so taxa are looked up by name rather than by number.
* This solves issues where the taxa labels are numbers e.g. in generated tree data **/
boolean m_bIsLabelledNewick = false;
/** flag to indicate that single child nodes are allowed **/
boolean m_bAllowSingleChild = false;
/**
 * Creates a parser that collects its taxon labels from the file it parses.
 *
 * @param burnInPercentage percentage of trees to skip at the start of the file;
 *                         negative values are treated as 0
 * @param allowSingleChild whether nodes with a single child are accepted by parseNewick
 */
public TreeSetParser(int burnInPercentage, boolean allowSingleChild) {
    m_bAllowSingleChild = allowSingleChild;
    m_nBurnInPercentage = Math.max(burnInPercentage, 0);

    // label and position lists start out empty
    m_sLabels = new ArrayList<>();
    m_fLatitude = new ArrayList<>();
    m_fLongitude = new ArrayList<>();

    // minima start at the top of the range and maxima at the bottom
    m_fMinLong = 180;
    m_fMaxLong = -180;
    m_fMinLat = 90;
    m_fMaxLat = -90;
} // c'tor
/**
 * Creates a parser that uses caller supplied label and position lists.
 * The lists are stored by reference, not copied.
 *
 * A null list is replaced by an empty list so that parseFile, which calls
 * size() on these lists, does not fail with a NullPointerException.
 * Taxa are looked up by name (m_bIsLabelledNewick) only when a non-null
 * label list was supplied.
 *
 * @param labels           taxon labels, may be null
 * @param longitude        longitudes of the taxa, may be null
 * @param latitude         latitudes of the taxa, may be null
 * @param burnInPercentage percentage of trees to skip at the start of the file;
 *                         negative values are treated as 0
 */
public TreeSetParser(List<String> labels, List<Float> longitude, List<Float> latitude, int burnInPercentage) {
    if (labels != null) {
        m_sLabels = labels;
        m_bIsLabelledNewick = true;
        m_nNrOfLabels = m_sLabels.size();
    } else {
        m_sLabels = new ArrayList<String>();
    }
    if (longitude != null) {
        m_fLongitude = longitude;
    } else {
        m_fLongitude = new ArrayList<Float>();
    }
    if (latitude != null) {
        m_fLatitude = latitude;
    } else {
        m_fLatitude = new ArrayList<Float>();
    }
    m_nBurnInPercentage = Math.max(burnInPercentage, 0);
    m_fMinLat = 90; m_fMinLong = 180;
    m_fMaxLat = -90; m_fMaxLong = -180;
}
/** number of characters one progress mark stands for; parseFile derives it from the file size **/
long fileStep;
/** number of characters consumed through readLine so far **/
long fileRead = 0;
/** character count at which the next progress mark is due **/
long fileMarked = 0;

/**
 * Reads all trees from a tree file.
 *
 * If some line of the file mentions "translate" (case insensitive) the file is
 * treated as a Nexus file: the translate block is read, then every line that
 * starts with "tree " is parsed, skipping the burn-in percentage of trees at the
 * start. Otherwise every line containing a '(' is parsed as a Newick tree and,
 * when no labels are known yet, taxon labels are collected from the first tree.
 *
 * After parsing, the branch lengths stored in the node heights are converted to
 * heights (see lengthToHeight and offsetHeight).
 *
 * @param fileName path of the tree file
 * @return the parsed trees; an empty array when the file contains no trees
 * @throws IOException if the file cannot be read
 */
public Node [] parseFile(String fileName) throws IOException {
    List<Node> trees = new ArrayList<>();
    m_nOffset = 0;
    File file = new File(fileName);
    fileStep = Math.max(file.length() / 61, 1);
    fileRead = 0;
    fileMarked = 0;

    // first, sweep through the file to determine the number of trees
    int nrOfTrees = countTreeLines(fileName);

    boolean hasTranslateBlock = false;
    boolean addLabels = false;
    // try-with-resources closes the reader on every path, including exceptions
    try (BufferedReader fin = new BufferedReader(new FileReader(fileName))) {
        // look for the translate block; an empty file yields an empty line
        String str = fin.ready() ? readLine(fin) : "";
        while (fin.ready() && !mentionsTranslate(str)) {
            str = readLine(fin);
        }
        m_bIsLabelledNewick = false;
        m_nNrOfLabels = m_sLabels.size();
        addLabels = (m_nNrOfLabels == 0);
        hasTranslateBlock = mentionsTranslate(str);
        if (hasTranslateBlock) {
            readTranslateBlock(fin, addLabels);
            // long arithmetic so that very large tree counts cannot overflow
            int burnIn = (int) ((long) m_nBurnInPercentage * nrOfTrees / 100);
            readNexusTrees(fin, trees, burnIn);
        }
    }

    if (!hasTranslateBlock) {
        // could not find translate block, assume it is a list of Newick trees instead of Nexus file
        m_bIsLabelledNewick = true;
        fileRead = 0;
        fileMarked = 0;
        try (BufferedReader fin = new BufferedReader(new FileReader(fileName))) {
            readNewickTrees(fin, trees, addLabels);
        }
    }

    // convert lengths (stored as node heights) to heights
    double [] heights = new double[trees.size()];
    for (int i = 0; i < trees.size(); i++) {
        heights[i] = lengthToHeight(trees.get(i), 0);
    }
    for (int i = 0; i < trees.size(); i++) {
        offsetHeight(trees.get(i), heights[i]);
    }
    Log.warning.println();
    // a zero length seed array gives an empty result instead of an array holding one null
    return trees.toArray(new Node[0]);
} // parseFile

/** Counts the lines that readNexusTrees will treat as trees, using the same trimmed "tree " test. */
private int countTreeLines(String fileName) throws IOException {
    int count = 0;
    try (BufferedReader fin = new BufferedReader(new FileReader(fileName))) {
        String line;
        while ((line = fin.readLine()) != null) {
            line = line.trim();
            if (line.length() > 5 && line.toLowerCase().startsWith("tree ")) {
                count++;
            }
        }
    }
    return count;
}

/** Tells whether the line contains the word "translate" in any letter case. */
private boolean mentionsTranslate(String line) {
    return line.toLowerCase().indexOf("translate") >= 0;
}

/** Parses one Newick string, sorts the tree and labels its internal nodes. */
private Node buildSortedTree(String newick) {
    Node tree = parseNewick(newick);
    tree.sort();
    tree.labelInternalNodes(m_nNrOfLabels);
    return tree;
}

/** Reads a file of plain Newick trees, collecting taxon labels from the first tree when none are known. */
private void readNewickTrees(BufferedReader fin, List<Node> trees, boolean addLabels) throws IOException {
    while (fin.ready() && m_nNrOfLabels == 0) {
        // readLine already adds the line length to fileRead
        String str = readLine(fin);
        if (str.length() > 2 && str.indexOf("(") >= 0) {
            if (addLabels) {
                // labels must be known before parseNewick looks them up
                collectNewickLabels(str);
            }
            trees.add(buildSortedTree(str));
        }
    }
    while (fin.ready()) {
        String str = readLine(fin);
        if (str.length() > 2 && str.indexOf("(") >= 0) {
            trees.add(buildSortedTree(str));
            if (trees.size() % 100 ==0) {if (m_nNrOfLabels>=100||trees.size() % 1000 ==0) {Log.warning.print(trees.size() + " ");}}
        }
    }
}

/** Extracts the taxon labels from a Newick line and appends them to m_sLabels. */
private void collectNewickLabels(String str) {
    String str2 = str.substring(str.indexOf("("));
    // strip bracketed sections
    int open;
    while ((open = str2.indexOf('[')) >= 0) {
        int close = str2.indexOf(']', open);
        if (close < 0) {
            // unterminated bracket: drop the remainder instead of looping forever
            str2 = str2.substring(0, open);
            break;
        }
        str2 = str2.substring(0, open) + str2.substring(close + 1);
    }
    str2 = str2.replaceAll("[;\\(\\),]"," ");
    str2 = str2.replaceAll(":[0-9\\.Ee-]+"," ");
    String [] labels = str2.split("\\s+");
    m_nNrOfLabels = 0;
    for (int i = 0; i < labels.length; i++) {
        if (labels[i].length() > 0) {
            m_sLabels.add(labels[i]);
            m_nNrOfLabels++;
        }
    }
}

/** Reads the entries of a Nexus translate block, up to and including the entry that carries the ';'. */
private void readTranslateBlock(BufferedReader fin, boolean addLabels) throws IOException {
    String str = fin.ready() ? readLine(fin) : "";
    boolean isLastLabel = false;
    while (fin.ready() && !isLastLabel) {
        if (str.indexOf(";") >= 0) {
            str = str.replace(';',' ');
            str = str.trim();
            if (str.isEmpty()) {
                break;
            }
            isLastLabel = true;
        }
        str = str.replaceAll(",", "");
        str = str.replaceAll("^\\s+", "");
        String[] strs = str.split("\\s+");
        int labelIndex = Integer.parseInt(strs[0]);
        String label = strs[1];
        if (m_sLabels.size() < labelIndex) {
            // index is ahead of the number of labels read so far, so indices are shifted by one
            m_nOffset = 1;
        }
        // check if there is geographic info in the name, written as (<lat>x<long>)
        if (label.contains("(")) {
            int strIndex = label.indexOf('(');
            int str2 = label.indexOf('x', strIndex);
            if (str2 >= 0) {
                int str3 = label.indexOf(')', str2);
                if (str3 >= 0) {
                    float lat = Float.parseFloat(label.substring(strIndex+1, str2));
                    float _long = Float.parseFloat(label.substring(str2+1, str3));
                    if (lat!=0 || _long!=0) {
                        m_fMinLat = Math.min(m_fMinLat, lat);
                        m_fMaxLat = Math.max(m_fMaxLat, lat);
                        m_fMinLong = Math.min(m_fMinLong, _long);
                        m_fMaxLong = Math.max(m_fMaxLong, _long);
                    }
                    // pad with zeros for earlier labels that had no position
                    while (m_fLatitude.size() < m_sLabels.size()) {
                        m_fLatitude.add(0f);
                        m_fLongitude.add(0f);
                    }
                    m_fLatitude.add(lat);
                    m_fLongitude.add(_long);
                }
            }
            label = label.substring(0, label.indexOf("("));
        }
        if (addLabels) {
            m_sLabels.add(label);
            m_nNrOfLabels++;
        }
        if (!isLastLabel) {
            str = readLine(fin);
        }
    }
}

/** Parses every remaining "tree " line of a Nexus file, skipping the first burnIn trees. */
private void readNexusTrees(BufferedReader fin, List<Node> trees, int burnIn) throws IOException {
    while (fin.ready()) {
        String str = readLine(fin);
        str = str.trim();
        if (str.length() > 5) {
            String tree = str.substring(0,5);
            if (tree.toLowerCase().startsWith("tree ")) {
                if (burnIn <= 0) {
                    // drop everything in front of the Newick string
                    int i = str.indexOf('(');
                    if (i > 0) {
                        str = str.substring(i);
                    }
                    trees.add(buildSortedTree(str));
                } else {
                    burnIn--;
                }
            }
        }
    }
}
/** number of progress marks printed by readLine **/
int k = 0;
/**
 * Reads one line from the reader and updates the progress display.
 *
 * The line length is added to fileRead; once fileRead comes within 10 characters
 * of fileMarked a '*' is printed to TreeAnnotator.progressStream and fileMarked
 * advances by fileStep.
 *
 * @param fin reader to take the line from
 * @return the line read, or the empty string when the end of the stream has been
 *         reached (BufferedReader.readLine returns null there, which used to
 *         cause a NullPointerException)
 * @throws IOException if reading fails
 */
private String readLine(BufferedReader fin) throws IOException {
    String s = fin.readLine();
    if (s == null) {
        // end of stream: nothing to count, hand back an empty line
        return "";
    }
    fileRead += s.length();
    if (fileRead > fileMarked - 10) {
        TreeAnnotator.progressStream.print("*");
        fileMarked += fileStep;
        k++;
    }
    return s;
}
/**
 * Adds f to the height of every node in the subtree below (and including) node.
 * Children are updated before the node itself.
 *
 * @param node root of the subtree to shift
 * @param f    amount added to each node height
 */
public void offsetHeight(Node node, double f) {
    boolean hasChildren = !node.isLeaf();
    if (hasChildren) {
        offsetHeight(node.getLeft(), f);
        Node rightChild = node.getRight();
        // single child nodes have no right child
        if (rightChild != null) {
            offsetHeight(rightChild, f);
        }
    }
    double shifted = node.getHeight() + f;
    node.setHeight(shifted);
}
/**
 * Replaces the branch length stored in each node height by the negated distance
 * from the root, and sets the ID of every leaf to its label from m_sLabels.
 *
 * @param node   root of the subtree to convert
 * @param offSet distance accumulated from the root down to the parent of node
 * @return the largest root-to-leaf distance found in the subtree, never below 0
 *         for internal nodes
 */
private double lengthToHeight(Node node, double offSet) {
    if (!node.isLeaf()) {
        // distance from the root to this node
        final double depth = offSet + node.getHeight();
        double deepest = Math.max(0.0, lengthToHeight(node.getLeft(), depth));
        Node rightChild = node.getRight();
        if (rightChild != null) {
            deepest = Math.max(deepest, lengthToHeight(rightChild, depth));
        }
        node.setHeight(-depth);
        return deepest;
    }
    // leaf: store negated depth and attach the taxon label
    node.setHeight(-offSet - node.getHeight());
    String leafLabel = m_sLabels.get(node.getNr());
    node.setID(leafLabel);
    return -node.getHeight();
}
/**
 * Maps a taxon name found in a Newick string onto a label index.
 *
 * Unless the trees are labelled Newick (m_bIsLabelledNewick), str is first
 * interpreted as a number, from which m_nOffset is subtracted. If that fails the
 * first m_nNrOfLabels entries of m_sLabels are searched: for an exact match, then
 * for a label that matches once its surrounding quotes are removed, and finally
 * the lookup is repeated with the quotes removed from str itself.
 *
 * @param str taxon name or number
 * @return index of the label
 * @throws IllegalArgumentException if str cannot be mapped to a label
 */
private int getLabelIndex(String str) {
    if (!m_bIsLabelledNewick) {
        try {
            return Integer.parseInt(str) - m_nOffset;
        } catch (NumberFormatException ignored) {
            // not a number, fall through to the lookup by name
        }
    }
    for (int i = 0; i < m_nNrOfLabels; i++) {
        if (str.equals(m_sLabels.get(i))) {
            return i;
        }
    }
    // str may have (double) quotes missing
    for (int i = 0; i < m_nNrOfLabels; i++) {
        String label = m_sLabels.get(i);
        if (isQuoted(label)) {
            label = label.substring(1, label.length()-1);
            if (str.equals(label)) {
                return i;
            }
        }
    }
    // str may have extra (double) quotes
    if (isQuoted(str)) {
        str = str.substring(1, str.length()-1);
        return getLabelIndex(str);
    }
    throw new IllegalArgumentException("Label '" + str + "' in Newick tree could not be identified");
}

/**
 * Tells whether s starts and ends with the same single or double quote character.
 * At least two characters are required, so a lone quote character does not count
 * and cannot lead to an invalid substring range when the quotes are stripped.
 */
private static boolean isQuoted(String s) {
    if (s.length() < 2) {
        return false;
    }
    char first = s.charAt(0);
    char last = s.charAt(s.length() - 1);
    return first == last && (first == '\'' || first == '"');
}
/**
 * Returns the length of node plus the largest summed length on any path from
 * node down to a leaf.
 *
 * A missing right child (single child node) is skipped, in line with
 * offsetHeight and lengthToHeight; previously this caused a NullPointerException.
 *
 * @param node root of the subtree to measure
 * @return summed branch length of the longest path starting at node
 */
double height(Node node) {
    if (node.isLeaf()) {
        return node.getLength();
    }
    double deepest = height(node.getLeft());
    if (node.getRight() != null) {
        deepest = Math.max(deepest, height(node.getRight()));
    }
    return node.getLength() + deepest;
}
/** characters of the Newick string currently being tokenized; set by parseNewick **/
char [] m_chars;
/** index in m_chars of the first character of the current token **/
int m_iTokenStart;
/** index in m_chars just past the last character of the current token **/
int m_iTokenEnd;
/** token types returned by nextToken **/
// ','
final static int COMMA = 1;
// '('
final static int BRACE_OPEN = 3;
// ')'
final static int BRACE_CLOSE = 4;
// ':'
final static int COLON = 5;
// ';'
final static int SEMI_COLON = 8;
// section starting with '[' and running up to and including the next ']'
final static int META_DATA = 6;
// run of characters up to the next delimiter: a label or a branch length
final static int TEXT = 7;
// returned when there is no further token
final static int UNKNOWN = 0;
/**
 * Scans the next token in m_chars, starting at m_iTokenEnd.
 *
 * Leading spaces and tabs are skipped. On return m_iTokenStart is the index of
 * the first character of the token and m_iTokenEnd the index just past it.
 *
 * @return the token type: one of COMMA, BRACE_OPEN, BRACE_CLOSE, COLON,
 *         SEMI_COLON, META_DATA or TEXT, or UNKNOWN when only whitespace (or
 *         nothing) is left. Previously trailing whitespace caused an
 *         ArrayIndexOutOfBoundsException.
 */
int nextToken() {
    m_iTokenStart = m_iTokenEnd;
    // skip spaces
    while (m_iTokenEnd < m_chars.length && (m_chars[m_iTokenEnd] == ' ' || m_chars[m_iTokenEnd] == '\t')) {
        m_iTokenStart++;
        m_iTokenEnd++;
    }
    if (m_iTokenEnd >= m_chars.length) {
        // ran off the end of the input
        return UNKNOWN;
    }
    switch (m_chars[m_iTokenEnd]) {
    case '(':
        m_iTokenEnd++;
        return BRACE_OPEN;
    case ':':
        m_iTokenEnd++;
        return COLON;
    case ';':
        m_iTokenEnd++;
        return SEMI_COLON;
    case ')':
        m_iTokenEnd++;
        return BRACE_CLOSE;
    case ',':
        m_iTokenEnd++;
        return COMMA;
    case '[':
        m_iTokenEnd++;
        // advance until the character just consumed is the closing bracket
        while (m_iTokenEnd < m_chars.length && m_chars[m_iTokenEnd-1] != ']') {
            m_iTokenEnd++;
        }
        return META_DATA;
    default:
        // plain text runs up to the next delimiter
        while (m_iTokenEnd < m_chars.length && (m_chars[m_iTokenEnd] != ' ' && m_chars[m_iTokenEnd] != '\t'
                && m_chars[m_iTokenEnd] != '(' && m_chars[m_iTokenEnd] != ')' && m_chars[m_iTokenEnd] != '['
                && m_chars[m_iTokenEnd] != ':'&& m_chars[m_iTokenEnd] != ','&& m_chars[m_iTokenEnd] != ';')) {
            m_iTokenEnd++;
        }
        return TEXT;
    }
}
/**
 * Parses a single Newick string into a tree of Nodes.
 *
 * Parsing starts at the first '(' in str. Branch lengths are stored in the
 * height field of each node (DEFAULT_LENGTH when no length is given), taxon
 * names are mapped to node numbers through getLabelIndex, and bracketed
 * metadata is handed to parseMetaData. Nodes with more than two children are
 * turned into binary nodes by pairwise merging under dummy parents.
 *
 * Branch lengths are parsed as doubles so that no precision is lost before they
 * are stored in the double valued node height.
 *
 * @param str Newick representation of one tree
 * @return the root of the parsed tree, or null if str is null, empty or
 *         contains no '('
 * @throws IllegalArgumentException if the string cannot be parsed; the message
 *         shows the text around the offending token and the original exception
 *         is attached as the cause
 */
public Node parseNewick(String str) {
    try {
        if (str == null || str.length() == 0) {
            return null;
        }
        m_chars = str.toCharArray();
        m_iTokenStart = str.indexOf('(');
        if (m_iTokenStart < 0) {
            return null;
        }
        m_iTokenEnd = m_iTokenStart;
        // stacks of nodes under construction, whether each one is the first
        // child of its parent, and the metadata collected for it
        Vector<Node> stack = new Vector<>();
        Vector<Boolean> isFirstChild = new Vector<>();
        Vector<String> metaDataString = new Vector<>();
        stack.add(new Node());
        isFirstChild.add(true);
        stack.lastElement().setHeight(DEFAULT_LENGTH);
        metaDataString.add(null);
        // true while the next TEXT token is a taxon label, false after a ':'
        boolean isLabel = true;
        while (m_iTokenEnd < m_chars.length) {
            switch (nextToken()) {
            case BRACE_OPEN:
            {
                Node node2 = new Node();
                node2.setHeight(DEFAULT_LENGTH);
                stack.add(node2);
                isFirstChild.add(true);
                metaDataString.add(null);
                isLabel = true;
            }
            break;
            case BRACE_CLOSE:
            {
                if (isFirstChild.lastElement()) {
                    if (m_bAllowSingleChild) {
                        // process single child nodes
                        Node left = stack.lastElement();
                        stack.remove(stack.size()-1);
                        isFirstChild.remove(isFirstChild.size()-1);
                        // NOTE(review): dummyparent is not used after this point and
                        // left is re-parented below -- looks removable, confirm
                        Node dummyparent = new Node();
                        dummyparent.setHeight(DEFAULT_LENGTH);
                        dummyparent.setLeft(left);
                        left.setParent(dummyparent);
                        dummyparent.setRight(null);
                        Node parent = stack.lastElement();
                        parent.setLeft(left);
                        left.setParent(parent);
                        String metaData = metaDataString.remove(metaDataString.size() - 1);
                        left.metaDataString = metaData;
                        parseMetaData(left, metaData);
                        break;
                    } else {
                        // don't know how to process single child nodes
                        throw new IllegalArgumentException("Node with single child found.");
                    }
                }
                // process multi(i.e. more than 2)-child nodes by pairwise merging.
                while (isFirstChild.elementAt(isFirstChild.size()-2) == false) {
                    Node right = stack.lastElement();
                    stack.remove(stack.size()-1);
                    isFirstChild.remove(isFirstChild.size()-1);
                    Node left = stack.lastElement();
                    stack.remove(stack.size()-1);
                    isFirstChild.remove(isFirstChild.size()-1);
                    Node dummyparent = new Node();
                    dummyparent.setHeight(DEFAULT_LENGTH);
                    dummyparent.setLeft(left);
                    left.setParent (dummyparent);
                    dummyparent.setRight(right);
                    right.setParent(dummyparent);
                    stack.add(dummyparent);
                    isFirstChild.add(false);
                    // NOTE(review): the entry removed here is the top of the metadata
                    // stack, yet it is attached to left -- confirm this is intended
                    String metaData = metaDataString.remove(metaDataString.size() - 1);
                    parseMetaData(left, metaData);
                }
                // last two nodes on stack merged into single parent node
                Node right = stack.lastElement();
                stack.remove(stack.size()-1);
                isFirstChild.remove(isFirstChild.size()-1);
                String metaData = metaDataString.remove(metaDataString.size() - 1);
                parseMetaData(right, metaData);
                Node left = stack.lastElement();
                stack.remove(stack.size()-1);
                isFirstChild.remove(isFirstChild.size()-1);
                metaData = metaDataString.remove(metaDataString.size() - 1);
                parseMetaData(left, metaData);
                Node parent = stack.lastElement();
                parent.setLeft(left);
                left.setParent(parent);
                parent.setRight(right);
                right.setParent(parent);
                metaData = metaDataString.lastElement();
                parseMetaData(parent, metaData);
            }
            break;
            case COMMA:
            {
                Node node2 = new Node();
                node2.setHeight(DEFAULT_LENGTH);
                stack.add(node2);
                isFirstChild.add(false);
                metaDataString.add(null);
                isLabel = true;
            }
            break;
            case COLON:
                isLabel = false;
                break;
            case TEXT:
                if (isLabel) {
                    String label = str.substring(m_iTokenStart, m_iTokenEnd);
                    stack.lastElement().setNr(getLabelIndex(label));
                } else {
                    String length = str.substring(m_iTokenStart, m_iTokenEnd);
                    stack.lastElement().setHeight(Double.parseDouble(length));
                }
                break;
            case META_DATA:
                // store the text between the brackets; several metadata
                // sections for one node are joined with a comma
                if (metaDataString.lastElement() == null) {
                    metaDataString.set(metaDataString.size()-1, str.substring(m_iTokenStart+1, m_iTokenEnd-1));
                } else {
                    metaDataString.set(metaDataString.size()-1, metaDataString.lastElement()
                            + ("," +str.substring(m_iTokenStart+1, m_iTokenEnd-1)));
                }
                break;
            case SEMI_COLON:
                parseMetaData(stack.lastElement(), metaDataString.lastElement());
                return stack.lastElement();
            default:
                throw new IllegalArgumentException("parseNewick: unknown token");
            }
        }
        return stack.lastElement();
    } catch (Exception e) {
        // report up to 100 characters of context in front of the offending token
        // and keep the original exception as the cause
        throw new IllegalArgumentException(e.getMessage() + ": " + str.substring(Math.max(0, m_iTokenStart-100), m_iTokenStart) + " >>>" + str.substring(m_iTokenStart, m_iTokenEnd) + " <<< ...", e);
    }
}
/**
 * Stores the raw metadata string on the node and parses it into key=value pairs.
 *
 * A value that parses as a number is stored as a Double, a value of the form
 * {a,b,...} whose entries are all numbers is stored as a Double array, and any
 * other value is stored as a String. A value ends at the last comma in front of
 * the next '='.
 *
 * Parsing is best effort: when the string is malformed, the pairs parsed up to
 * that point are kept and a warning is written to Log.warning instead of the
 * failure being silently discarded.
 *
 * @param node           node that receives the metadata
 * @param metaDataString metadata text without the enclosing brackets, may be null
 */
public void parseMetaData(Node node, String metaDataString) {
    node.metaDataString = metaDataString;
    if (metaDataString == null) {
        return;
    }
    // parse by key=value pairs
    int i = 0;
    // the first key starts at index 1, so the first character of the string is
    // skipped -- presumably the '&' that starts BEAST metadata, TODO confirm
    int start = 1;
    try {
        while ((i = metaDataString.indexOf('=', i)) >= 0) {
            String key = metaDataString.substring(start, i).trim();
            String value = null;
            int next = metaDataString.indexOf('=', i+1);
            if (next >= 0) {
                // the value runs up to the last comma in front of the next '='
                int comma = metaDataString.lastIndexOf(',', next);
                value = metaDataString.substring(i + 1, comma);
                start = comma + 1;
            } else {
                value = metaDataString.substring(i+1);
            }
            if (value.length() > 0 && value.charAt(0) != '{') {
                try {
                    Double dvalue = Double.parseDouble(value);
                    node.setMetaData(key, dvalue);
                } catch (NumberFormatException e) {
                    // not a number, keep the text
                    node.setMetaData(key, value);
                }
            } else if (value.length() > 0 && value.charAt(0) == '{' && value.charAt(value.length() - 1) == '}') {
                try {
                    String str = value.substring(1, value.length() - 1);
                    String [] strs = str.split(",");
                    Double [] values = new Double[strs.length];
                    for (int j = 0; j < strs.length; j++) {
                        values[j] = Double.parseDouble(strs[j]);
                    }
                    node.setMetaData(key, values);
                } catch (NumberFormatException e) {
                    // not a list of numbers, keep the text
                    node.setMetaData(key, value);
                }
            } else {
                node.setMetaData(key, value);
            }
            i++;
        }
    } catch (Exception e) {
        // keep what was parsed so far, but make the problem visible
        Log.warning.print("Could not parse all metadata in '" + metaDataString + "': " + e);
        Log.warning.println();
    }
}
} // class TreeFileParser