TreeParser.java example

Explorer
JContextExplorer-master
- JContextExplorer
  - src
package newickTreeParsing;

import java.awt.Frame;
import java.awt.GridBagConstraints;
import java.awt.GridBagLayout;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.awt.event.WindowEvent;
import java.awt.event.WindowListener;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.StreamTokenizer;
import java.util.ArrayList;
import java.util.EmptyStackException;
import java.util.Stack;
import java.util.Vector;
import javax.swing.JButton;
import javax.swing.JFrame;
import javax.swing.JList;
import javax.swing.JOptionPane;
import javax.swing.JProgressBar;

/**
 * @author James
 *
 * Parses the newick portion of a file
 * For nexus files, additional node-number mapping is needed to rename files
 * Identification of a file as either newick or nexus determines contents
 * 
 * */
public class TreeParser
{
	
	//fields
	/**
	 * Support value forced onto every node handled by {@link #popAndName(String, Stack)}.
	 * The default of -1.0 acts as "not set": popAndName only calls setSupport
	 * when this field differs from -1.
	 */
	public double support = -1.0;
	
	/**
	 * Nexus file identifier.  {@link #isNexusFile(String)} searches the first line of a
	 * file for it, and {@link #nexusTokenize(Vector, JProgressBar)} skips it when it shows up as a word.
	 */
	private static final String nexusFileID = "#NEXUS";
	/** Begin tag; opens a nexus section.  Compared case-insensitively. */
	private static final String beginTag = "begin";
	/** End tag; closes a nexus section.  Compared case-insensitively. */
	private static final String endTag = "end";
    //  trees section
	/** Name of the tree section, expected right after {@link #beginTag}. */
	private static final String treeSectionTag = "trees";
	/** Tree ID: word that starts a tree statement inside the tree section. */
	private static final String treeID = "tree";
	/** Alternative tree ID, handled everywhere exactly like {@link #treeID}. */
	private static final String utreeID = "utree"; // two different tree IDs?
    
	/** Line (and tree information) termination. */
	private static final char lineTerminator = ';';
	/** Equality sign; in the tree section it separates the tree name from the newick string. */
	private static final char equals = '=';
	/** Nexus comment open. */
	private static final char commentOpen = '[';
	/** Nexus comment close. */
	private static final char commentClose = ']';

	/**
	 * True: show debug output.  False: suppress printing.
	 * Read by {@link #debugOutput(String)}.
	 */
	private static boolean debugOutput = false;
    /** Tokenizer over the reader handed to the constructor; used by all tokenize methods of this instance. */
    private StreamTokenizer tokenizer;
    /**
     * Root node of the tree being parsed.  Re-assigned at the start of every
     * {@link #tokenize(long, String, JProgressBar)} call; popAndName compares against it
     * to recognise the root when the node stack runs empty.
     */
    private TreeNode rootNode;
    
    /**
     * Guess the type of treeFile based on the presence of the nexus identifier.
     * Only the first line of the file is inspected; it is searched for
     * {@link #nexusFileID} anywhere in the line.
     * @param fileName The name of the file.
     * @return true when the first line contains the nexus identifier, false otherwise
     * (including when the file is empty, cannot be found or cannot be read).
     */
    public boolean isNexusFile(String fileName)
    {
        boolean returnValue = false;
        BufferedReader r = null;
        try
        {
            r = new BufferedReader(new FileReader(fileName));
            String line = r.readLine();
            // readLine() returns null for an empty file; that is simply "not nexus"
            if (line != null && line.indexOf(nexusFileID) != -1)
                returnValue = true;
        }
        catch (FileNotFoundException e)
        {
            System.err.println("Could not find file to identify: " + fileName);
        }
        catch (IOException e)
        {
            System.out.println("Couldn't identify file: " + fileName);
        }
        finally
        {
            // always release the file handle, also when reading failed
            if (r != null)
            {
                try
                {
                    r.close();
                }
                catch (IOException ignored)
                {
                    // the answer is already known; a failing close changes nothing
                }
            }
        }
        return returnValue;
    }
    
    /**
     * Parses names of trees in nexus file.
     * Scans the file for "begin trees" sections and, inside them, records the word
     * that follows each tree/utree tag.  A tree whose name is not a plain word
     * token is skipped without adding an entry.
     * @param fileName Name of nexus file.
     * @return List of all tree names found in nexus file, in file order.  The list is
     * null when the file could not be opened (or the very first read failed); after a
     * later read error the names collected so far are returned.
     */
    public static ArrayList nexusFileTreeNames(String fileName)
    {
        ArrayList returnList = null;
        BufferedReader r = null;
        try
        {
            r = new BufferedReader(new FileReader(fileName));
            StreamTokenizer st = new StreamTokenizer(r);
            st.wordChars('#', '#');
            st.nextToken();
            returnList = new ArrayList();
            while (st.ttype != StreamTokenizer.TT_EOF)
            {
                if (!isWordToken(st, beginTag))
                {
                    st.nextToken();
                    continue;
                }
                st.nextToken(); // the section name follows the begin tag
                if (isWordToken(st, treeSectionTag))
                {
                    // found a tree section, huzzah
                    st.nextToken();
                    collectTreeNames(st, returnList);
                }
                else
                {
                    // not a tree section, find the end tag or the next start tag;
                    // the EOF test must guard the word tests because sval is null at EOF
                    while (st.nextToken() != StreamTokenizer.TT_EOF
                            && !isWordToken(st, beginTag)
                            && !isWordToken(st, endTag))
                    {
                        // keep skipping
                    }
                }
            }
        }
        catch (FileNotFoundException e)
        {
            System.err.println("Could not find file to identify: " + fileName);
        }
        catch (IOException e)
        {
            System.out.println("Couldn't identify file: " + fileName);
        }
        finally
        {
            // always release the file handle, also when parsing failed
            if (r != null)
            {
                try
                {
                    r.close();
                }
                catch (IOException ignored)
                {
                    // nothing useful can be done about a failing close
                }
            }
        }
        return returnList;
    }

    /**
     * Tells whether the current token of st is a word equal (ignoring case) to the given word.
     */
    private static boolean isWordToken(StreamTokenizer st, String word)
    {
        return st.ttype == StreamTokenizer.TT_WORD && st.sval.equalsIgnoreCase(word);
    }

    /**
     * Collects tree names from the body of a tree section.  Starts at the current token
     * of st and returns with st positioned on the end tag, or at EOF.
     */
    private static void collectTreeNames(StreamTokenizer st, ArrayList names) throws IOException
    {
        while (st.ttype != StreamTokenizer.TT_EOF)
        {
            if (isWordToken(st, endTag))
                return;
            if (isWordToken(st, treeID) || isWordToken(st, utreeID))
            {
                // found the start of a tree
                st.nextToken();
                if (st.ttype == StreamTokenizer.TT_WORD)
                    names.add(st.sval); // found a tree name
                // find the end of the tree
                while (st.nextToken() != StreamTokenizer.TT_EOF && st.ttype != lineTerminator)
                {
                    // keep skipping
                }
            }
            else
            {
                // Anything else (punctuation, numbers, or other words such as
                // "translate") is consumed one token at a time.  Unrecognised words
                // must be consumed too, otherwise this loop never advances.
                st.nextToken();
            }
        }
    }
    
    /**
     * For listing the selected results from the list of Nexus tree names.
     */
    private JList selection;
    /**
     * For returning results inside action listeners.
     * Written by the button listeners of the chooser and read by the thread that
     * polls in chooseNames, hence volatile so the polling thread sees the update.
     */
    private volatile Vector returnVector;
    /**
     * Frame for selecting Nexus trees from the full set.
     * Stays null until initNexusChooser has built the chooser.
     */
    private JFrame selectionFrame = null;
    /**
     * Layout and listeners for the nexus chooser.
     * Allows for the population of {@link #returnVector}: "Select" stores the selected
     * list indices (as Integers), "Cancel" stores an empty vector; both hide the frame.
     * The frame is built and packed here but not shown.
     */
    private void initNexusChooser()
    {
        selection = new JList();
        selectionFrame = new JFrame("Nexus tree selection");
        selection.setDragEnabled(true);
        selectionFrame.setLayout(new GridBagLayout());

        // the list spans both columns of the first row
        GridBagConstraints gbc = new GridBagConstraints();
        gbc.fill = GridBagConstraints.BOTH;
        gbc.gridx = 0;
        gbc.gridy = 0;
        gbc.gridwidth = 2;
        selectionFrame.add(selection, gbc);

        // the two buttons share the second row, one column each
        gbc.fill = GridBagConstraints.NONE;
        gbc.gridy++;
        gbc.gridwidth = 1;
        JButton ok = new JButton("Select");
        JButton cancel = new JButton("Cancel");
        ok.addActionListener( new ActionListener() {
            public void actionPerformed(ActionEvent e)
            {
                int[] selectedList = selection.getSelectedIndices();
                Vector chosen = new Vector();
                for (int i = 0; i < selectedList.length; i++)
                    chosen.add(Integer.valueOf(selectedList[i]));
                // publish the result only once it is complete, then release the waiter
                returnVector = chosen;
                selectionFrame.setVisible(false);
            }
        });
        cancel.addActionListener( new ActionListener() {
            public void actionPerformed(ActionEvent e)
            {
                returnVector = new Vector(); // empty vector, should load no trees
                selectionFrame.setVisible(false);
            }
        });
        selectionFrame.add(ok, gbc);
        gbc.gridx++;
        selectionFrame.add(cancel, gbc);
        // pack() sizes the frame to fit its contents, so no explicit size is set first
        selectionFrame.pack();
        // component dump is debugging aid only; keep it behind the debug flag
        if (debugOutput)
            selection.list();
    }
    
    /**
     * Pops a dialog to select the trees to load from a nexus file.
     * Also, wrapper for {@link #initNexusChooser()} dialog.
     * The calling thread is blocked (polling every 100 ms) until the chooser frame is
     * hidden again, so this must not be called from the Swing event dispatch thread:
     * the buttons that hide the frame could then never be processed.
     * @param treeNames arraylist of tree names.
     * @return vector List of integers that correspond to the trees to load;
     * not using names since trees might have the same name?  The vector is empty when
     * the user cancels, closes the frame without choosing, or the wait is interrupted.
     */
    public Vector chooseNames(ArrayList treeNames)
    {
        if (selectionFrame == null)
            initNexusChooser();
        returnVector = null;
        selection.setListData(treeNames.toArray());
        selection.setVisible(true);
        debugOutput("selection list should have: " + treeNames);

        selectionFrame.pack();
        selectionFrame.validate();
        selectionFrame.setVisible(true);
        while (selectionFrame.isVisible())
        {
            try
            {
                Thread.sleep(100);
            }
            catch (InterruptedException e)
            {
                // keep the interrupt visible to the caller and stop waiting
                Thread.currentThread().interrupt();
                selectionFrame.setVisible(false);
                break;
            }
        }

        // no listener ran (frame closed or wait interrupted): treat like Cancel
        Vector result = returnVector;
        return (result != null) ? result : new Vector();
    }
    
    /**
     * Initializes parsing of a tree by creating a tokenizer and setting default
     * properties (such as spacing, quoting characters).  
     * {@link #tokenize(long, String, JProgressBar)} is required to start the parsing.
     * @param b Buffered reader that could start in the middle of a nexus file or
     * the start of a newick file (basically the beginning of a newick tree, is run
     * for each tree in a nexus file)
     */
    public TreeParser(BufferedReader b)
    {
        tokenizer = new StreamTokenizer(b);
        tokenizer.eolIsSignificant(false);
        tokenizer.quoteChar('"');
//        tokenizer.quoteChar('\''); // TODO: check quote layering, quoted quotes
        tokenizer.wordChars('\'', '\''); // quote problem, turn this into a prime symbol?
        // 32 = space
        tokenizer.wordChars('!', '!'); // 33
        // 34 = "
        tokenizer.wordChars('#', '&'); // 35-38
        // 39-41 = '() newick
        tokenizer.wordChars('*', '+'); // 42-43
        // 44 = , newick
        tokenizer.wordChars('-', '/'); // 45-47
        // 48-59 = [0-9]:;
        tokenizer.wordChars('<', '<'); // 60
        // 61 = = nexus
        tokenizer.wordChars('>', '@'); // 62-64
        // 65-90 = [A-Z]
//        tokenizer.wordChars('[', '['); // 91 [ nexus comment character, treat as char
        // 92 = \ (esc, support esc'd spaces)
//      93 = ] nexus comment character
        tokenizer.wordChars('^', '`'); // 93-96
        // 97-122 = [a-z]
        tokenizer.wordChars('{', '~'); // 123-126
        // 127 = del
    }
    
    /**
     * Debug printout function.  Prints the string to standard output, but only while the
     * {@link #debugOutput} flag is set; prefer this over direct system calls.
     * @param s Display the string, for debugging.
     */
    public void debugOutput(String s)
    {
        if (!debugOutput)
            return;
        System.out.println(s);
    }

    /**
     * Takes the node on top of the stack, names it, and hooks it into the tree as a
     * child of the node that is on top of the stack afterwards.
     * @param name Name of the node; null gives the node an empty name.
     * @param nodeStack Stack of nodes that haven't been added to the tree yet.  Nodes are popped when
     * they have names and all children are processed.
     * @return The popped node, now named and linked into the tree.
     */
    private TreeNode popAndName(String name, Stack nodeStack)
    {
        final TreeNode popped = (TreeNode) nodeStack.pop();
        final String effectiveName = (name != null) ? name : "";
        popped.label = effectiveName;
        popped.setName(effectiveName);

        try
        {
            ((TreeNode) nodeStack.peek()).addChild(popped);
        }
        catch (Exception e)
        {
            // Having no parent left on the stack is only expected for the root node;
            // for any other node the input is reported as malformed.
            if (popped != rootNode)
            {
                JOptionPane.showMessageDialog(null, "The selected file is not appropriately formatted.",
                        "Tree Parsing Error", JOptionPane.ERROR_MESSAGE);
            }
        }

        // per-node bookkeeping (leaf extremes, leaf count, traversal links)
        popped.setExtremeLeaves();
        popped.setNumberLeaves();
        popped.linkNodesInPreorder();
        popped.linkNodesInPostorder();

        // -1 means no support override has been configured on this parser
        if (support != -1)
            popped.setSupport(support);
        return popped;
    }
    
    /**
     * Newick tokenizer: converts a string (tree as a string) into a tree object.
     * The stream tokenizer should be initialized before calling this function.
     * Parsing stops after the tree terminator (';') or at the end of the stream,
     * whichever comes first; in both cases a still unnamed root is finalised.
     * @param fileLength Length of the file, for progress bar movements.  Currently not used.
     * For nexus files, this would be the relative position of the next semicolon = the size of the tree in bytes.
     * @param streamName Name of the tree or file that is being loaded.  Nexus files have names ("tree <name> = ((...));", newick trees are named by file name.
     * @param progressBar Reference to a progress bar widget.  Currently not used; may be null.
     * @return Tree parsed from the stream.
     */
    public Tree tokenize(long fileLength, String streamName,
            JProgressBar progressBar)
    {
        final char openBracket = '(', closeBracket = ')', childSeparator = ',',
            treeTerminator = lineTerminator, doubleQuote = '"', infoSeparator = ':';
        rootNode = new TreeNode();
        Tree t = new Tree();
        t.setRootNode(rootNode);
        t.setFileName(streamName);
        // While nameNext is true, the top of the stack is the node waiting for its name.
        Stack nodeStack = new Stack();
        nodeStack.push(rootNode);
        int thisToken;
        TreeNode lastNamed = null;      // node that may still receive a branch length
        boolean endOfTree = false;
        boolean nameNext = true;
        boolean readSupport = false;    // true between ')' and the following ':'
        double supportValue = 0.0;

        try
        {
            while (!endOfTree &&
                    (thisToken = tokenizer.nextToken()) != StreamTokenizer.TT_EOF)
            {
                switch (thisToken)
                {
                    case doubleQuote:
                    case StreamTokenizer.TT_WORD:
                        if (!nameNext)
                            System.err.println("Error: didn't expect this name here: " + tokenizer.sval);
                        lastNamed = popAndName(tokenizer.sval, nodeStack);
                        nameNext = false;
                        break;
                    case StreamTokenizer.TT_NUMBER:
                        if (readSupport)
                        {
                            // number directly after ')' is remembered as support value
                            // NOTE(review): the value is never reset, so every later node that
                            // gets a weight also receives the last support read - confirm intended
                            supportValue = tokenizer.nval;
                        }
                        else
                        {
                            if (nameNext)
                            {
                                // NOTE(review): sval is not set for number tokens, so a numeric
                                // name ends up as an empty node name - confirm intended
                                lastNamed = popAndName(tokenizer.sval, nodeStack);
                            }
                            else if (lastNamed != null)
                            {
                                lastNamed.setWeight(tokenizer.nval);
                                lastNamed.setSupport(supportValue);
                                lastNamed = null;
                            }
                            else
                            {
                                System.err.println("Error: can't set value " + tokenizer.nval + " to a null node");
                            }
                            nameNext = false;
                        }
                        break;
                    case infoSeparator:
                        if (nameNext)
                            lastNamed = popAndName(null, nodeStack);
                        nameNext = false;
                        readSupport = false;
                        break;
                    case treeTerminator:
                        if (nameNext)
                            lastNamed = popAndName(null, nodeStack);
                        endOfTree = true;
                        nameNext = false;
                        break;
                    case openBracket:
                        nodeStack.push(new TreeNode());
                        nameNext = true;
                        break;
                    case closeBracket:
                        if (nameNext)
                            lastNamed = popAndName(null, nodeStack);
                        nameNext = true;
                        readSupport = true;
                        break;
                    case childSeparator:
                        if (nameNext)
                            lastNamed = popAndName(null, nodeStack);
                        nodeStack.push(new TreeNode());
                        nameNext = true;
                        break;
                    default:
                        debugOutput("default " + (char)thisToken);
                        break;
                }
            }
            // The stream ended without a tree terminator: finalise the pending node the
            // same way the terminator would have, so the root still gets linked up.
            if (!endOfTree && nameNext && !nodeStack.isEmpty())
                popAndName(null, nodeStack);
        }
        catch (IOException e)
        {
            // report instead of hiding the failure; the partial tree is still returned
            System.err.println("Newick tokenizer error: " + e);
        }
        if (!nodeStack.isEmpty())
            System.err.println("Node stack still has " + nodeStack.size() + " things");

        t.postProcess();

        return t;
    }
    /**
     * Placeholder for a nexus taxa section tokenizer.  Performs no work at present;
     * it only names the keywords a later implementation would look for.
     */
    private void nexusTaxaTokenize()
    {
        // taxa section keywords; the section may turn out to be disposable since
        // its content is replicated elsewhere
        final String dimensionID = "dimensions";
        final String taxLabelID = "taxlabels";
    }
    
    /**
     * Tokenize the tree section of a nexus file only, uses newick tokenizer.
     * Reads tokens until the end tag of the section or the end of the stream.  Trees
     * are counted by their '=' sign, starting at 0, and each selected tree is handed to
     * {@link #tokenize(long, String, JProgressBar)}.
     * @param treeNumbers Vector of Integers for commandline-based input of nexus trees; assume this vector is in ascending order.
     * null selects every tree, an empty vector selects none.  The vector is consumed:
     * entries are removed from it as they are used.
     * @return arraylist of trees parsed from the tree file.
     */
    private ArrayList nexusTreeTokenize(Vector treeNumbers)
    {
        ArrayList treeArray = new ArrayList();
        // newick tree subsection stuff (newick encoding)

        debugOutput("tokenizing tree section");
        boolean noExplicitSelection = true;
        int nextNumber = -1;    // index of the next tree to load, -1 when none is pending
        if (treeNumbers != null && treeNumbers.size() > 0)
        {
            noExplicitSelection = false;
            nextNumber = ((Integer)treeNumbers.get(0)).intValue();
            treeNumbers.remove(0);
        }
        if (!noExplicitSelection && nextNumber == -1)
            return treeArray;   // explicit selection that asks for nothing

        boolean treeSectionEnd = false;
        boolean nextTreeID = false;
        int thisToken;
        int currTree = 0;
        String currTreeName = null;
        try
        {
            // A single pass: the loop ends at the end tag or at EOF, and a read error
            // leaves it through the catch below instead of retrying forever.
            while (!treeSectionEnd &&
                    (thisToken = tokenizer.nextToken()) != StreamTokenizer.TT_EOF)
            {
                switch (thisToken)
                {
                    case StreamTokenizer.TT_WORD:
                        if (nextTreeID)
                        {
                            currTreeName = tokenizer.sval;
                            debugOutput("found tree ID: " + currTreeName);
                            nextTreeID = false;
                        }
                        else if (tokenizer.sval.equalsIgnoreCase(treeID) ||
                                tokenizer.sval.equalsIgnoreCase(utreeID))
                        {
                            debugOutput("new tree");
                            nextTreeID = true; // tree tag found, next word is a tree name
                        }
                        else if (tokenizer.sval.equalsIgnoreCase(endTag))
                            treeSectionEnd = true;
                        break;
                    case equals:
                        // the newick string starts right after the equals sign
                        if (treeNumbers == null || currTree == nextNumber)
                        {
                            Tree t = tokenize(0, currTreeName, null);
                            treeArray.add(t);
                            if (treeNumbers != null && !treeNumbers.isEmpty())
                            {
                                nextNumber = ((Integer)treeNumbers.get(0)).intValue();
                                treeNumbers.remove(0);
                            }
                            else
                                nextNumber = -1;
                        }
                        currTree++;
                        break;
                    case commentOpen:
                        debugOutput("TEating comment");
                        while (thisToken != StreamTokenizer.TT_EOF && thisToken != commentClose)
                        {
                            thisToken = tokenizer.nextToken(); // eat the comments
                        }
                        break;
                    default:
                        debugOutput("Tdefault " + (char)thisToken);
                        break;
                }
            }
        }
        catch (IOException e)
        {
            System.err.println("Nexus tokenizer error: " + e);
        }
        return treeArray;
    }
    
    /**
     * Placeholder for a nexus character section tokenizer.  Performs no work at
     * present; it only names the keywords a later implementation (for sequences,
     * for example) would look for.
     */
    private void nexusCharacterTokenize()
    {
        // character section keywords, for sequence encodings and such
        final String formatID = "format";
        final String numTaxaID = "ntax";
        final String numCharID = "nchar";
        final String dataTypeID = "datatype";
        final String gapID = "gap";
        final String missingID = "missing";
        final String matrixID = "matrix";
    }
    
    /**
     * Tokenize a nexus file, uses newick tokenizer after identifying the region with the tree information.
     * Walks the token stream to its end; every "begin" word is followed by a section
     * name that decides which section tokenizer runs.  Nexus comments between
     * sections are skipped.
     * @param treeNumbers Vector of Integers for commandline-based input of nexus trees; assume this vector is in ascending order.
     * null selects every tree, an empty vector selects none.  The vector is consumed
     * while the tree section is read.
     * @param progressBar Currently not used; may be null.
     * @return arraylist of trees parsed from the nexus file, or null when no tree
     * section was found.
     */
    public ArrayList nexusTokenize(Vector treeNumbers, JProgressBar progressBar)
    {
        // string concatenation copes with a null vector, unlike calling toString() on it
        debugOutput("Nexus tokenize: " + treeNumbers);
        ArrayList treeArray = null;
        // Nexus string externalization: all strings are case insensitive
        // the sections:
        //  characters - sequences
        final String characterTag = "character";
        //  taxa - a list of all taxa in this file?
        final String taxaTag = "taxa";

        int thisToken;
        try
        {
            while ((thisToken = tokenizer.nextToken()) != StreamTokenizer.TT_EOF)
            {
                switch (thisToken)
                {
                    case StreamTokenizer.TT_WORD:
                        if (tokenizer.sval.equalsIgnoreCase(nexusFileID))
                        {
                            // file identifier, nothing to do
                        }
                        else if (tokenizer.sval.equalsIgnoreCase(beginTag))
                        {
                            debugOutput("beginning new section: " + tokenizer.sval);
                            tokenizer.nextToken();
                            // sval is null when the begin tag is followed by something
                            // that is not a word (punctuation, a number, end of stream)
                            String sectionName = tokenizer.sval;
                            if (sectionName == null)
                                debugOutput("begin tag without a section name");
                            else if (sectionName.equalsIgnoreCase(treeSectionTag))
                                treeArray = nexusTreeTokenize(treeNumbers);
                            else if (sectionName.equalsIgnoreCase(characterTag))
                                nexusCharacterTokenize();
                            else if (sectionName.equalsIgnoreCase(taxaTag))
                                nexusTaxaTokenize();
                        }
                        else
                            debugOutput("Word: " + tokenizer.sval);
                        break;
                    case commentOpen:
                        debugOutput("Eating comment");
                        while (thisToken != StreamTokenizer.TT_EOF && thisToken != commentClose)
                        {
                            thisToken = tokenizer.nextToken(); // eat the comments
                        }
                        break;
                    default:
                        debugOutput("default " + (char)thisToken);
                        break;
                }
            }
        }
        catch (IOException e)
        {
            System.err.println("Nexus tokenizer error: " + e);
        }
        return treeArray;
    }
    
//    /**
//     * Test application function.
//     * @param args Program arguments.  Only first argument used (for filename).
//     */
//    public static void main(String[] args)
//    {
//        String fileName = args[0];
//        long start = System.currentTimeMillis();
//        File f = new File(fileName);
//        try
//        {
//            BufferedReader r = new BufferedReader(new FileReader(f));
//            TreeParser tp = new TreeParser(r);
//            Tree t = tp.tokenize(f.length(), f.getName(), null);
//        }
//        catch (FileNotFoundException e)
//        {
//            System.out.println("Couldn't find file: " + fileName);
//        }
//        System.out.println("Parsed in " + ((System.currentTimeMillis() - start)/1000.0) + " s");
//        System.exit(0);
//    }
    
   
}