package newickTreeParsing; import java.awt.Frame; import java.awt.GridBagConstraints; import java.awt.GridBagLayout; import java.awt.event.ActionEvent; import java.awt.event.ActionListener; import java.awt.event.WindowEvent; import java.awt.event.WindowListener; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.io.StreamTokenizer; import java.util.ArrayList; import java.util.EmptyStackException; import java.util.Stack; import java.util.Vector; import javax.swing.JButton; import javax.swing.JFrame; import javax.swing.JList; import javax.swing.JOptionPane; import javax.swing.JProgressBar; /** * @author James * * Parses the newick portion of a file * For nexus files, additional node-number mapping is needed to rename files * Identification of a file as either newick or nexus determines contents * * */ public class TreeParser { //fields public double support = -1.0; /** Nexus file identifier. We look for this as the first token to identify a tree file as Nexus, or other. */ private static final String nexusFileID = "#NEXUS"; /** Begin tag. */ private static final String beginTag = "begin"; /** End tag. */ private static final String endTag = "end"; // trees section /** Tree section. */ private static final String treeSectionTag = "trees"; /** Tree ID. */ private static final String treeID = "tree"; /** Tree ID (same or similar to {@link #treeID}?). */ private static final String utreeID = "utree"; // two different tree IDs? /** Line (and tree information) termination. */ private static final char lineTerminator = ';'; /** Equality sign. */ private static final char equals = '='; /** Nexus comment open. */ private static final char commentOpen = '['; /** Nexus comment close. */ private static final char commentClose = ']'; /** * True: show debug output. False: suppress printing. */ private static boolean debugOutput = false; private StreamTokenizer tokenizer; /** * Root node of the tree being parsed. Must be initialized outside the tokenizer. */ private TreeNode rootNode; /** * Guess the type of treeFile based on the presence of nexus identifiers. * @param fileName The name of the file. * @return true when file is nexus format, false if nexus strings weren't found. */ public boolean isNexusFile(String fileName) { boolean returnValue = false; BufferedReader r; try { r = new BufferedReader(new FileReader(fileName)); String line = r.readLine(); if (line.indexOf(nexusFileID) != -1) returnValue = true; r.close(); } catch (FileNotFoundException e) { System.err.println("Could not find file to identify: " + fileName); } catch (IOException e) { System.out.println("Couldn't identify file: " + fileName); } return returnValue; } /** * Parses names of trees in nexus file. * @param fileName Name of nexus file. * @return List of all tree names found in nexus file */ public static ArrayList nexusFileTreeNames(String fileName) { ArrayList returnList = null; BufferedReader r; try { r = new BufferedReader(new FileReader(fileName)); StreamTokenizer st = new StreamTokenizer(r); st.wordChars('#', '#'); st.nextToken(); returnList = new ArrayList(); while (st.ttype != StreamTokenizer.TT_EOF) { if (st.ttype == StreamTokenizer.TT_WORD) { if (st.sval.equalsIgnoreCase(beginTag)) { st.nextToken(); if (st.ttype == StreamTokenizer.TT_WORD && st.sval.equalsIgnoreCase(treeSectionTag)) { // found a tree section, huzzah boolean endOfTreeList = false; st.nextToken(); while (st.ttype != StreamTokenizer.TT_EOF && !endOfTreeList) { // expect either a tree/utree id or the end tag if (st.ttype == StreamTokenizer.TT_WORD) { if (st.sval.equalsIgnoreCase(endTag)) endOfTreeList = true; else if (st.sval.equalsIgnoreCase(treeID) || st.sval.equalsIgnoreCase(utreeID)) { // found the start of a tree st.nextToken(); if (st.ttype == StreamTokenizer.TT_WORD) { returnList.add(st.sval); // found a tree name } while (st.nextToken() != StreamTokenizer.TT_EOF && st.ttype != ';'); // find the end of the tree } } else st.nextToken(); // eat a non-word while looking for first tree word // System.out.println("Not a word while looking for a tree start tag: " + st.ttype); } } // not a tree section, find the end tag or the next start tag else while (st.nextToken() != StreamTokenizer.TT_EOF && st.ttype != StreamTokenizer.TT_WORD || (!st.sval.equalsIgnoreCase(beginTag) && !st.sval.equalsIgnoreCase(endTag))); } else st.nextToken(); } else st.nextToken(); } r.close(); } catch (FileNotFoundException e) { System.err.println("Could not find file to identify: " + fileName); } catch (IOException e) { System.out.println("Couldn't identify file: " + fileName); } return returnList; } /** * For listing the selected results from the list of Nexus tree names. */ private JList selection; /** * For returning results inside action listeners. */ private Vector returnVector; /** * Frame for selecting Nexus trees from the full set. */ private JFrame selectionFrame = null; /** * Layout and listeners for the nexus chooser. * Allows for the population of {@link #returnVector}. * */ private void initNexusChooser() { selection = new JList(); selectionFrame = new JFrame("Nexus tree selection"); selection.setDragEnabled(true); selectionFrame.setLayout(new GridBagLayout()); GridBagConstraints gbc = new GridBagConstraints(); gbc.fill = GridBagConstraints.BOTH; gbc.gridx = 0; gbc.gridy = 0; gbc.gridwidth = 2; selectionFrame.add(selection, gbc); gbc.fill = GridBagConstraints.NONE; gbc.gridy++; gbc.gridwidth = 1; JButton ok = new JButton("Select"); JButton cancel = new JButton("Cancel"); ok.addActionListener( new ActionListener() { public void actionPerformed(ActionEvent e) { returnVector = new Vector(); int[] selectedList = selection.getSelectedIndices(); for (int i = 0; i < selectedList.length; i++) returnVector.add(new Integer(selectedList[i])); selectionFrame.setVisible(false); } }); cancel.addActionListener( new ActionListener() { public void actionPerformed(ActionEvent e) { returnVector = new Vector(); // empty vector, should load no trees selectionFrame.setVisible(false); } }); selectionFrame.add(ok, gbc); gbc.gridx++; selectionFrame.add(cancel, gbc); selectionFrame.setSize(250, 300); selectionFrame.pack(); selection.list(); } /** * Pops a dialog to select the trees to load from a nexus file. * Also, wrapper for {@link #initNexusChooser()} dialog. * @param treeNames arraylist of tree names. * @return vector List of integers that correspond to the trees to load; * not using names since trees might have the same name? */ public Vector chooseNames(ArrayList treeNames) { if (selectionFrame == null) initNexusChooser(); returnVector = null; selection.setListData(treeNames.toArray()); selection.setVisible(true); System.out.println("selection list should have: " + treeNames.toString()); selectionFrame.pack(); selectionFrame.validate(); selectionFrame.setVisible(true); while (selectionFrame.isVisible()) {try {Thread.sleep(100);} catch (InterruptedException e) {} } return returnVector; } /** * Initializes parsing of a tree by creating a tokenizer and setting default * properties (such as spacing, quoting characters). * {@link #tokenize(long, String, JProgressBar)} is required to start the parsing. * @param b Buffered reader that could start in the middle of a nexus file or * the start of a newick file (basically the beginning of a newick tree, is run * for each tree in a nexus file) */ public TreeParser(BufferedReader b) { tokenizer = new StreamTokenizer(b); tokenizer.eolIsSignificant(false); tokenizer.quoteChar('"'); // tokenizer.quoteChar('\''); // TODO: check quote layering, quoted quotes tokenizer.wordChars('\'', '\''); // quote problem, turn this into a prime symbol? // 32 = space tokenizer.wordChars('!', '!'); // 33 // 34 = " tokenizer.wordChars('#', '&'); // 35-38 // 39-41 = '() newick tokenizer.wordChars('*', '+'); // 42-43 // 44 = , newick tokenizer.wordChars('-', '/'); // 45-47 // 48-59 = [0-9]:; tokenizer.wordChars('<', '<'); // 60 // 61 = = nexus tokenizer.wordChars('>', '@'); // 62-64 // 65-90 = [A-Z] // tokenizer.wordChars('[', '['); // 91 [ nexus comment character, treat as char // 92 = \ (esc, support esc'd spaces) // 93 = ] nexus comment character tokenizer.wordChars('^', '`'); // 93-96 // 97-122 = [a-z] tokenizer.wordChars('{', '~'); // 123-126 // 127 = del } /** * Debug printout function. Avoid using the system calls and use this, and set flag * {@link #debugOutput} depending on debugging or not. * @param s Display the string, for debugging. */ public void debugOutput(String s) { if (debugOutput) System.out.println(s); } /** * Adds node at the top of the stack to the tree. TreeNode is already created based * on Newick properties. * @param name Name of the node. * @param nodeStack Stack of nodes that haven't been added to the tree yet. Nodes are popped when * they have names and all children are processed. * @return Newly added treeNode linked into the tree. */ private TreeNode popAndName(String name, Stack nodeStack) { TreeNode topNode = (TreeNode)nodeStack.pop(); if (name == null) { topNode.label = ""; topNode.setName(""); } else { topNode.label = name; topNode.setName(name); } try { TreeNode parent = (TreeNode) nodeStack.peek(); parent.addChild(topNode); } catch (Exception e) { if (topNode != rootNode) JOptionPane.showMessageDialog(null, "The selected file is not appropriately formatted.", "Tree Parsing Error", JOptionPane.ERROR_MESSAGE); //System.out.println("Parser error on node " + topNode); } topNode.setExtremeLeaves(); // sets leftmost and rightmost leaf, non-recursive topNode.setNumberLeaves(); // sets number of leaves, non-recursive topNode.linkNodesInPreorder(); topNode.linkNodesInPostorder(); if (support != -1){ topNode.setSupport(support); } return topNode; } /** * Newick tokenizer: converts a string (tree as a string) into a tree object. * The stream tokenizer should be initialized before calling this function. * @param fileLength Length of the file, for progress bar movements. * For nexus files, this would be the relative position of the next semicolon = the size of the tree in bytes. * @param streamName Name of the tree or file that is being loaded. Nexus files have names ("tree <name> = ((...));", newick trees are named by file name. * @param progressBar Reference to a progress bar widgit, embedded perhaps in place of the new canvas for this tree. If this is null, create a new progress bar here. * @return Tree parsed from the stream. */ public Tree tokenize(long fileLength, String streamName, JProgressBar progressBar) { final char openBracket = '(', closeBracket = ')', childSeparator = ',', treeTerminator = lineTerminator, quote = '\'', doubleQuote = '"', infoSeparator = ':'; int progress = 0; rootNode = new TreeNode(); Tree t = new Tree(); t.setRootNode(rootNode); t.setFileName(streamName); Stack nodeStack = new Stack(); nodeStack.push(rootNode); int thisToken; TreeNode lastNamed = null; boolean EOT = false; boolean nameNext = true; boolean ReadSupport = false; double SupportValue = 0.0; //System.out.println("Breakpoint!"); try { while (EOT == false && (thisToken = tokenizer.nextToken()) != StreamTokenizer.TT_EOF) { switch (thisToken) { // case quote: case doubleQuote: case StreamTokenizer.TT_WORD: if (!nameNext){ System.err.println("Error: didn't expect this name here: " + tokenizer.sval); //JOptionPane.showMessageDialog(null, "The selected file is not appropriately formatted.","Tree Parsing Error", JOptionPane.ERROR_MESSAGE); } lastNamed = popAndName(tokenizer.sval, nodeStack); progress += tokenizer.sval.length(); nameNext = false; break; case StreamTokenizer.TT_NUMBER: if (ReadSupport){ //System.out.println("Support: " + tokenizer.nval); SupportValue = tokenizer.nval; break; } else { if (nameNext) lastNamed = popAndName(tokenizer.sval, nodeStack); else { if (lastNamed != null){ lastNamed.setWeight(tokenizer.nval); lastNamed.setSupport(SupportValue); } else{ System.err.println("Error: can't set value " + tokenizer.nval + " to a null node"); } lastNamed = null; } progress += (new Double(tokenizer.nval).toString()).length(); nameNext = false; break; } case infoSeparator: if (nameNext) lastNamed = popAndName(null, nodeStack); progress += 1; nameNext = false; ReadSupport = false; break; case treeTerminator: case StreamTokenizer.TT_EOF: if (nameNext) lastNamed = popAndName(null, nodeStack); EOT = true; progress += 1; nameNext = false; break; case openBracket: nodeStack.push(new TreeNode()); progress += 1; nameNext = true; break; case closeBracket: if (nameNext) lastNamed = popAndName(null, nodeStack); progress += 1; nameNext = true; ReadSupport = true; break; case childSeparator: if (nameNext) lastNamed = popAndName(null, nodeStack); nodeStack.push(new TreeNode()); progress += 1; nameNext = true; break; default: debugOutput("default " + (char)thisToken); break; } } } catch (IOException e) { } if (!nodeStack.isEmpty()) System.err.println("Node stack still has " + nodeStack.size() + " things"); t.postProcess(); return t; } /** * Nexus taxa tokenizer, does nothing for now, but can be used later. * */ private void nexusTaxaTokenize() { // taxa section stuff, we might be able to just throw this away, these are replicated everywhere else final String dimensionID = "dimensions", taxLabelID = "taxlabels"; } /** * Tokenize the tree section of a nexus file only, uses newick tokenizer. * @param treeNumbers Vector of Integers for commandline-based input of nexus trees; assume this vector is in ascending order * @return arraylist of trees parsed from the tree file. */ private ArrayList nexusTreeTokenize(Vector treeNumbers) { ArrayList treeArray = new ArrayList(); final String titleTag = "title", linkTag = "link", translateTag = "translate"; // newick tree subsection stuff (newick encoding) debugOutput("tokenizing tree section"); boolean readAllTrees = true; boolean treeSectionEnd = false; boolean nextTreeID = false; int nextNumber = -1; int thisToken; int currTree = 0; String currTreeName = null; if (treeNumbers != null && treeNumbers.size() > 0) { readAllTrees = false; nextNumber = ((Integer)treeNumbers.get(0)).intValue(); treeNumbers.remove(0); } while ((readAllTrees || nextNumber != -1) && !treeSectionEnd) try { while (!treeSectionEnd && (thisToken = tokenizer.nextToken()) != StreamTokenizer.TT_EOF) { switch (thisToken) { case StreamTokenizer.TT_WORD: if (nextTreeID) { currTreeName = tokenizer.sval; debugOutput("found tree ID: " + currTreeName); nextTreeID = false; } else if (tokenizer.sval.equalsIgnoreCase(treeID) || tokenizer.sval.equalsIgnoreCase(utreeID)) { debugOutput("new tree"); nextTreeID = true; // tree tag found, next word is a tree name } else if (tokenizer.sval.equalsIgnoreCase(endTag)) treeSectionEnd = true; // debugOutput("TWord: " + tokenizer.sval); break; case equals: { if (treeNumbers == null || currTree == nextNumber) { Tree t = tokenize(0, currTreeName, null); treeArray.add(t); if (treeNumbers != null && !treeNumbers.isEmpty()) { nextNumber = ((Integer)treeNumbers.get(0)).intValue(); treeNumbers.remove(0); } else nextNumber = -1; } currTree++; } break; // eat the equals case commentOpen: debugOutput("TEating comment"); while (thisToken != StreamTokenizer.TT_EOF && thisToken != commentClose) { thisToken = tokenizer.nextToken(); // eat the comments } break; default: debugOutput("Tdefault " + (char)thisToken); break; } } } catch (IOException e) { System.err.println("Nexus tokenizer error: " + e); } return treeArray; } /** * Tokenize the character section of a nexus file only. Does nothing for now, but * can be extended to handle sequences, for example. */ private void nexusCharacterTokenize() { // character section stuff, for sequence encodings and such // (this parser may later extend to cover sequences) final String formatID = "format", numTaxaID = "ntax", numCharID = "nchar", dataTypeID = "datatype", gapID = "gap", missingID = "missing", matrixID = "matrix"; } /** * Tokenize a nexus file, uses newick tokenizer after identifying the region with the tree information. * @param treeNumbers Vector of Integers for commandline-based input of nexus trees; assume this vector is in ascending order. * @return arraylist of trees parsed from the nexus file. */ public ArrayList nexusTokenize(Vector treeNumbers, JProgressBar progressBar) { System.out.println("Nexus tokenize: " + treeNumbers.toString()); ArrayList treeArray = null; // Nexus string externalization: all strings are case insensitive final String // the sections: // characters - sequences characterTag = "character", // taxa - a list of all taxa in this file? taxaTag = "taxa"; boolean EOF = false; int thisToken; try { while (EOF == false && (thisToken = tokenizer.nextToken()) != StreamTokenizer.TT_EOF) { switch (thisToken) { case StreamTokenizer.TT_WORD: if (tokenizer.sval.equalsIgnoreCase(nexusFileID)); // ignore else if (tokenizer.sval.equalsIgnoreCase(beginTag)) { debugOutput("beginning new section: " + tokenizer.sval); thisToken = tokenizer.nextToken(); if (tokenizer.sval.equalsIgnoreCase(treeSectionTag)) treeArray = nexusTreeTokenize(treeNumbers); else if (tokenizer.sval.equalsIgnoreCase(characterTag)) nexusCharacterTokenize(); else if (tokenizer.sval.equalsIgnoreCase(taxaTag)) nexusTaxaTokenize(); } else debugOutput("Word: " + tokenizer.sval); break; case commentOpen: debugOutput("Eating comment"); while (thisToken != StreamTokenizer.TT_EOF && thisToken != commentClose) { thisToken = tokenizer.nextToken(); // eat the comments } break; default: debugOutput("default " + (char)thisToken); break; } } } catch (IOException e) { System.err.println("Nexus tokenizer error: " + e); } return treeArray; } // /** // * Test application function. // * @param args Program arguments. Only first argument used (for filename). // */ // public static void main(String[] args) // { // String fileName = args[0]; // long start = System.currentTimeMillis(); // File f = new File(fileName); // try // { // BufferedReader r = new BufferedReader(new FileReader(f)); // TreeParser tp = new TreeParser(r); // Tree t = tp.tokenize(f.length(), f.getName(), null); // } // catch (FileNotFoundException e) // { // System.out.println("Couldn't find file: " + fileName); // } // System.out.println("Parsed in " + ((System.currentTimeMillis() - start)/1000.0) + " s"); // System.exit(0); // } }