NexusParser.java example

Explorer
beast2-master
- src
package beast.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import beast.core.BEASTInterface;
import beast.core.parameter.RealParameter;
import beast.core.util.Log;
import beast.evolution.alignment.Alignment;
import beast.evolution.alignment.FilteredAlignment;
import beast.evolution.alignment.Sequence;
import beast.evolution.alignment.Taxon;
import beast.evolution.alignment.TaxonSet;
import beast.evolution.datatype.DataType;
import beast.evolution.datatype.StandardData;
import beast.evolution.datatype.UserDataType;
import beast.evolution.tree.TraitSet;
import beast.evolution.tree.Tree;
import beast.math.distributions.Exponential;
import beast.math.distributions.Gamma;
import beast.math.distributions.LogNormalDistributionModel;
import beast.math.distributions.MRCAPrior;
import beast.math.distributions.Normal;
import beast.math.distributions.ParametricDistribution;
import beast.math.distributions.Uniform;


/**
 * Parses a NEXUS file and extracts the alignment, calibrations, taxa and trees it contains.
 */
public class NexusParser {
    /**
     * Line number counter for the NEXUS file being parsed. It is reset to zero at the start of
     * {@code parseFile(String, Reader)} and included in error messages when parsing fails.
     */
    int lineNr;

    /**
     * BEAST 2 objects reconstructed from the file.
     * {@code m_alignment} is set when a DATA or CHARACTERS block is parsed and
     * {@code traitSet} when a CALIBRATION block is parsed.
     */
    public Alignment m_alignment;
    public List<Alignment> filteredAlignments = new ArrayList<>();
    public TraitSet traitSet;
    public List<MRCAPrior> calibrations;

    // taxon labels; assigned by the TAXA block parser and, for integer-indexed translate
    // tables, by the TREES block parser
    public List<String> taxa;
    // Taxon objects created for every label found in a TAXA block or data matrix
    List<Taxon> taxonList = new ArrayList<>();
    // trees parsed from a TREES block, in file order
    public List<Tree> trees;

    // NOTE(review): presumably used to keep generated sequence IDs unique across parsers — confirm
    static Set<String> g_sequenceIDs;

    // translate table of the most recently parsed TREES block, or null if none was found
    public Map<String, String> translationMap = null;

    static {
        g_sequenceIDs = new HashSet<>();
    }

    public List<TaxonSet> taxonsets = new ArrayList<>();

    // listeners that are notified of each tree as it is parsed
    private List<NexusParserListener> listeners = new ArrayList<>();

    /**
     * Registers a listener that is called back while parsing, e.g. for every tree that is read
     * from a TREES block.
     *
     * @param listener the listener to register
     */
    public void addListener(final NexusParserListener listener) {
        this.listeners.add(listener);
    }

    /**
     * Try to parse BEAST 2 objects from the given file.
     * <p>
     * The ID handed to the parsed results is the file name with any directory prefix removed and
     * everything from the first '.' onwards stripped. The file is closed again once parsing has
     * finished, whether it succeeded or not.
     *
     * @param file the file to parse.
     * @throws IOException if the file cannot be read or its content cannot be parsed
     */
    public void parseFile(final File file) throws IOException {
        final String fileName = file.getName().replaceAll(".*[\\/\\\\]", "").replaceAll("\\..*", "");

        // parseFile(String, Reader) does not close the reader it is given, so the file handle
        // has to be released here
        try (Reader reader = new FileReader(file)) {
            parseFile(fileName, reader);
        }
    }

    /**
     * Try to reconstruct BEAST 2 objects from the given reader.
     * <p>
     * The input is scanned line by line for {@code begin <block>;} lines. DATA/CHARACTERS,
     * CALIBRATION, ASSUMPTIONS/SETS/MRBAYES, TAXA and TREES blocks are handed to their dedicated
     * parsers; all other lines are ignored. The reader is not closed by this method.
     *
     * @param id     a name to give to the parsed results
     * @param reader a reader to parse from
     * @throws IOException if reading fails or the content cannot be interpreted. Any underlying
     *                     exception is attached as the cause.
     * TODO: RRB: throws IOException now instead of just Exception.
     * java.text.ParseException seems more appropriate, but requires keeping track of the position in the file, which is non-trivial
     */
    public void parseFile(final String id, final Reader reader) throws IOException {
        lineNr = 0;
        final BufferedReader fin;
        if (reader instanceof BufferedReader) {
            fin = (BufferedReader) reader;
        } else {
            fin = new BufferedReader(reader);
        }
        try {
            while (fin.ready()) {
                final String str = nextLine(fin);
                if (str == null) {
                    processSets();
                    return;
                }
                final String lower = str.toLowerCase();
                if (lower.matches("^\\s*begin\\s+data;\\s*$") || lower.matches("^\\s*begin\\s+characters;\\s*$")) {
                    m_alignment = parseDataBlock(fin);
                    m_alignment.setID(id);
                } else if (lower.matches("^\\s*begin\\s+calibration;\\s*$")) {
                    traitSet = parseCalibrationsBlock(fin);
                } else if (lower.matches("^\\s*begin\\s+assumptions;\\s*$") ||
                        lower.matches("^\\s*begin\\s+sets;\\s*$") ||
                        lower.matches("^\\s*begin\\s+mrbayes;\\s*$")) {
                    parseAssumptionsBlock(fin);
                } else if (lower.matches("^\\s*begin\\s+taxa;\\s*$")) {
                    parseTaxaBlock(fin);
                } else if (lower.matches("^\\s*begin\\s+trees;\\s*$")) {
                    parseTreesBlock(fin);
                }
            }
            processSets();

        } catch (TreeParser.TreeParsingException e) {
            int errorLine = lineNr + 1;

            if (e.getLineNum() != null) {
                errorLine += e.getLineNum() - 1;
            }

            String errorMsg = "Encountered error interpreting the Newick string found around line " +
                    errorLine + " of the input file.";

            if (e.getCharacterNum() != null) {
                errorMsg += "\nThe parser reports that the error occurred at character " + (e.getCharacterNum()+1)
                        + " of the Newick string on this line.";
            }

            errorMsg += "\nThe parser gives the following clue:\n" + e.getMessage();

            // keep the original exception as cause so its stack trace is not lost
            throw new IOException(errorMsg, e);

        } catch (Exception e) {
            // keep the original exception as cause: many failures (e.g. NullPointerException)
            // carry no message, which would otherwise leave nothing to diagnose the problem with
            throw new IOException("Around line " + (lineNr+1) + "\n" + e.getMessage(), e);
        }
    } // parseFile

	/**
	 * Parses a TREES block: an optional translate table followed by any number of
	 * {@code tree <name> = <newick>} lines. Lines are read until the reader is exhausted.
	 * <p>
	 * Every parsed tree is first reported to the registered listeners and then appended to
	 * {@link #trees}. If the translate table uses contiguous integer keys starting at 0 or 1,
	 * {@link #taxa} is replaced by the translated labels in index order.
	 *
	 * @param fin reader positioned just after the {@code begin trees;} line
	 * @throws IOException if reading from {@code fin} fails
	 */
	private void parseTreesBlock(final BufferedReader fin) throws IOException {
        trees = new ArrayList<>();
        // read to first non-empty line within trees block; readLine yields null once the
        // input is exhausted, in which case there is nothing to parse
        String str = readLine(fin);
        while (str != null && str.trim().isEmpty()) {
            str = readLine(fin);
        }
        if (str != null) {
            str = str.trim();
        }

        int origin = -1;

        // if first non-empty line is "translate" then parse translate block
        if (str != null && str.toLowerCase().contains("translate")) {
            translationMap = parseTranslateBlock(fin);
            origin = getIndexedTranslationMapOrigin(translationMap);
            if (origin != -1) {
                taxa = getIndexedTranslationMap(translationMap, origin);
            }
        }

        // read trees
        while (str != null) {
            if (str.toLowerCase().startsWith("tree ")) {
                // drop everything in front of the Newick string ("tree name = [&R] ")
                final int i = str.indexOf('(');
                if (i > 0) {
                    str = str.substring(i);
                }
                TreeParser treeParser;

                if (origin != -1) {
                    treeParser = new TreeParser(taxa, str, origin, false);
                } else {
                    // the numbering base is unknown: try zero-based labels first and fall
                    // back to one-based labels if that runs past the taxon list
                    try {
                        treeParser = new TreeParser(taxa, str, 0, false);
                    } catch (ArrayIndexOutOfBoundsException e) {
                        treeParser = new TreeParser(taxa, str, 1, false);
                    }
                }

                if (translationMap != null) {
                    treeParser.translateLeafIds(translationMap);
                }

                // this needs to go after translation map or listeners have an incomplete tree!
                for (final NexusParserListener listener : listeners) {
                    listener.treeParsed(trees.size(), treeParser);
                }

                // this must come after listener or trees.size() gives the wrong index to treeParsed
                trees.add(treeParser);
            }
            str = fin.readLine();
            if (str != null) {
                str = str.trim();
            }
        }
    }

    /**
     * Turns an integer-keyed translation map into a list of labels ordered by key.
     *
     * @param translationMap translate table whose keys are integers written as strings
     * @param origin         the smallest key in the table; the label for key {@code k} ends up
     *                       at position {@code k - origin}
     * @return fixed-size list with one label per entry of the table
     */
    private List<String> getIndexedTranslationMap(final Map<String, String> translationMap, final int origin) {

        final int size = translationMap.size();
        Log.warning.println("translation map size = " + size);

        final String[] labels = new String[size];
        for (final Map.Entry<String, String> entry : translationMap.entrySet()) {
            final int slot = Integer.parseInt(entry.getKey()) - origin;
            labels[slot] = entry.getValue();
        }
        return Arrays.asList(labels);
    }

    /**
     * Determines whether a translate table is indexed by consecutive integers.
     *
     * @param translationMap translate table, keyed by the labels used inside the Newick strings
     * @return minimum key value if keys are a contiguous set of integers starting from zero or one,
     *         -1 otherwise (including when the table is empty or any key is not an integer)
     */
    private int getIndexedTranslationMapOrigin(final Map<String, String> translationMap) {

        // an empty table has no origin; first()/last() below would throw on an empty set
        if (translationMap.isEmpty()) {
            return -1;
        }

        final SortedSet<Integer> indices = new TreeSet<>();
        for (final String key : translationMap.keySet()) {
            try {
                indices.add(Integer.parseInt(key));
            } catch (NumberFormatException e) {
                // translate tables may use arbitrary labels as keys; such a table is not indexed
                return -1;
            }
        }

        final int first = indices.first();
        final int last = indices.last();
        // all keys are distinct strings, so the range is contiguous exactly when it spans
        // as many values as there are keys
        final boolean contiguous = last - first == translationMap.size() - 1;
        if (contiguous && (first == 0 || first == 1)) {
            return first;
        }
        return -1;
    }

    /**
     * Reads the body of a translate command, i.e. everything up to the terminating semicolon.
     * <p>
     * The semicolon may be on a line of its own or directly follow the last entry. Entries are
     * separated by commas and consist of a key and a label separated by blanks or tabs; entries
     * that do not have exactly these two tokens are skipped with a warning.
     *
     * @param reader a reader positioned just after the line holding the {@code translate} keyword
     * @return a map of taxa translations, keys are generally integer node number starting from 1
     *         whereas values are generally descriptive strings.
     * @throws IOException if reading from {@code reader} fails
     */
    private Map<String, String> parseTranslateBlock(final BufferedReader reader) throws IOException {

        final Map<String, String> translationMap = new HashMap<>();

        final StringBuilder translateBlock = new StringBuilder();
        String line = readLine(reader);
        while (line != null) {
            final String trimmed = line.trim();
            final int terminator = trimmed.indexOf(';');
            if (terminator >= 0) {
                // keep what precedes the semicolon: the last entry is commonly written as "10 taxon10;"
                translateBlock.append(trimmed, 0, terminator);
                break;
            }
            // separate lines by a blank so that tokens on consecutive lines are not fused
            translateBlock.append(trimmed).append(' ');
            line = readLine(reader);
        }

        for (final String taxaTranslation : translateBlock.toString().split(",")) {
            // entries usually carry a leading blank ("1 A, 2 B"), which would otherwise
            // produce an empty first token and get the entry rejected
            final String entry = taxaTranslation.trim();
            if (entry.isEmpty()) {
                continue;
            }
            final String[] translation = entry.split("[\t ]+");
            if (translation.length == 2) {
                translationMap.put(translation[0], translation[1]);
            } else {
                Log.warning.println("Ignoring translation:" + Arrays.toString(translation));
            }
        }
        return translationMap;
    }

    /**
     * Parses a TAXA block. The labels listed under {@code taxlabels} are stored in {@link #taxa}
     * (which is replaced by a fresh list) and a {@link Taxon} per label is appended to
     * {@link #taxonList}. Lines are consumed up to and including the block's {@code end} line.
     *
     * @param fin reader positioned just after the {@code begin taxa;} line
     * @throws IOException if a quoted label is not closed on its line, or if the number of labels
     *                     differs from the {@code ntax} value of a {@code dimensions} command
     */
    private void parseTaxaBlock(final BufferedReader fin) throws IOException {
        taxa = new ArrayList<>();
        // stays -1 when the block has no 'dimensions' command, which disables the count check
        int expectedTaxonCount = -1;
        String str;
        do {
            str = nextLine(fin);
            if (str.toLowerCase().matches("\\s*dimensions\\s.*")) {
                // everything after "ntax=" (minus semicolons) is taken to be the taxon count
                str = str.substring(str.toLowerCase().indexOf("ntax=") + 5);
                str = str.replaceAll(";", "");
                expectedTaxonCount = Integer.parseInt(str.trim());
            } else if (str.toLowerCase().trim().startsWith("taxlabels")) {
                // labels may follow the keyword on the same line; 9 == "taxlabels".length()
            	str = str.trim().substring(9).trim();
                // only fetch a new line straight away when nothing follows the keyword
            	boolean initial = (str.equals(""));
                do {
                	if (initial) {
                        str = nextLine(fin);
                	}
                    // from the second round onwards a new line is always read
                	initial = true;
                    str = str.replaceAll(";", "");
                    str = str.trim();
                    if (str.length() > 0 && !str.toLowerCase().equals("end")) {
                    	String [] strs = str.split("\\s+");
                    	for (int i = 0; i < strs.length; i++) {
                        	String taxon = strs[i];
                            // a quoted label may contain blanks: glue tokens together (separated
                            // by a single space) until one ends with the opening quote character
                            if (taxon.charAt(0) == '\'' || taxon.charAt(0) == '\"') {
                            	while (i < strs.length && taxon.charAt(0) != taxon.charAt(taxon.length() - 1)) {
                            		i++;
                            		if (i == strs.length) {
                            			throw new IOException("Unclosed quote starting with " + taxon);
                            		}
                            		taxon += " " + strs[i];
                            	}
                                // strip the surrounding quotes
                            	taxon = taxon.substring(1, taxon.length() - 1);
                            }
                            taxa.add(taxon);
                            taxonList.add(new Taxon(taxon));
                    	}
                    }
                    // labels are collected line by line until the block's "end" line is reached
                } while (!str.toLowerCase().replaceAll(";", "").equals("end"));
            }
            // str still holds "end" here when the taxlabels loop consumed the end line
        } while (!str.toLowerCase().replaceAll(";", "").equals("end"));
        if (expectedTaxonCount >= 0 && taxa.size() != expectedTaxonCount) {
            throw new IOException("Number of taxa (" + taxa.size() + ") is not equal to 'dimension' " +
            		"field (" + expectedTaxonCount + ") specified in 'taxa' block");
        }
    }

    /**
     * Parses a CALIBRATION block and turns it into a "date" {@link TraitSet}.
     * <p>
     * An {@code options} line supplies the units through its {@code scale} attribute (a trailing
     * "s" is dropped). The lines that follow, up to the first line containing a semicolon, are
     * read as comma separated {@code ... = <date> : <taxon> <taxon> ...} items and converted to
     * {@code taxon=date} trait assignments. The taxon set of the trait is built from
     * {@link #m_alignment}.
     *
     * @param fin reader positioned just after the {@code begin calibration;} line
     * @return the initialised trait set
     * @throws IOException if reading from {@code fin} fails
     */
    TraitSet parseCalibrationsBlock(final BufferedReader fin) throws IOException {
        final TraitSet dates = new TraitSet();
        dates.traitNameInput.setValue("date", dates);

        String line;
        do {
            line = nextLine(fin);
            if (line.toLowerCase().contains("options")) {
                String scale = getAttValue("scale", line);
                if (scale.endsWith("s")) {
                    scale = scale.substring(0, scale.length() - 1);
                }
                dates.unitsInput.setValue(scale, dates);
            }
        } while (line.toLowerCase().contains("tipcalibration"));

        // gather the calibration lines; the line holding the semicolon ends the list
        final StringBuilder raw = new StringBuilder();
        for (line = nextLine(fin); !line.contains(";"); line = nextLine(fin)) {
            raw.append(line);
        }

        // rewrite "<date> : t1 t2" items as one "taxon=date" assignment per taxon
        final StringBuilder traits = new StringBuilder();
        for (final String calibration : raw.toString().split(",")) {
            final String[] parts = calibration.split(":");
            final String date = parts[0].replaceAll(".*=\\s*", "");
            for (final String taxon : parts[1].split("\\s+")) {
                if (!taxon.matches("^\\s*$")) {
                    traits.append(taxon).append("=").append(date).append(",\n");
                }
            }
        }

        // drop the trailing ",\n"
        final String traitText = traits.toString();
        final String assignments = traitText.substring(0, traitText.length() - 2);
        dates.traitsInput.setValue(assignments, dates);

        final TaxonSet taxonSet = new TaxonSet();
        taxonSet.initByName("alignment", m_alignment);
        dates.taxaInput.setValue(taxonSet, dates);

        dates.initAndValidate();
        return dates;
    } // parseCalibrations


    /**
     * Parses a DATA or CHARACTERS block and creates an {@link Alignment} from it.
     * <p>
     * The {@code dimensions} and {@code format} commands are read first to obtain the number of
     * characters, the optional number of taxa, the data type and the missing/gap/match
     * characters. An optional {@code charstatelabels} command (standard data only) is processed
     * next, followed by the matrix. The rows of a taxon may be spread over several lines or
     * several chunks of the matrix; they are concatenated in file order.
     *
     * @param fin reader positioned just after the {@code begin data;} / {@code begin characters;} line
     * @return the initialised alignment with one {@link Sequence} per taxon of the matrix
     * @throws IOException if the block ends prematurely, the {@code nchar} attribute is missing,
     *                     {@code datatype=standard} is used without {@code symbols}, a quoted taxon
     *                     name is not closed, a sequence does not have {@code nchar} characters or
     *                     the number of taxa does not agree with {@code ntax}
     * @throws IllegalArgumentException if {@code charstatelabels} is used with a data type other
     *                                  than standard
     */
    public Alignment parseDataBlock(final BufferedReader fin) throws IOException {

        final Alignment alignment = new Alignment();

        String str;
        int taxonCount = -1;
        int charCount = -1;
        int totalCount = 4;
        String missing = "?";
        String gap = "-";
        // indicates character matches the one in the first sequence
        String matchChar = null;
        do {
            str = nextLine(fin);
            if (str == null) {
                throw new IOException("Unexpected end of file in data block: no 'matrix' command found");
            }

            //dimensions ntax=12 nchar=898;
            if (str.toLowerCase().contains("dimensions")) {
                str = getNextDataBlock(str, fin);

                final String character = getAttValue("nchar", str);
                if (character == null) {
                    throw new IOException("nchar attribute expected (e.g. 'dimensions nchar=123'), not " + str);
                }
                charCount = Integer.parseInt(character);
                final String ntax = getAttValue("ntax", str);
                if (ntax != null) {
                    taxonCount = Integer.parseInt(ntax);
                }
            } else if (str.toLowerCase().contains("format")) {
                str = getNextDataBlock(str, fin);

                //format datatype=dna interleave=no gap=-;
                final String dataTypeName = getAttValue("datatype", str);
                String symbols = getAttValue("symbols", str);
                if (symbols != null) {
                    symbols = symbols.replaceAll("\\s", "");
                }
                final String dataTypeLower = dataTypeName == null ? null : dataTypeName.toLowerCase();
                if (dataTypeLower == null) {
                    Log.warning.println("Warning: expected datatype (e.g. something like 'format datatype=dna;') not '" + str + "' Assuming integer dataType");
                    alignment.dataTypeInput.setValue("integer", alignment);
                    if (symbols != null && (symbols.equals("01") || symbols.equals("012"))) {
                        totalCount = symbols.length();
                    }
                } else if (dataTypeLower.equals("rna") || dataTypeLower.equals("dna") || dataTypeLower.equals("nucleotide")) {
                    alignment.dataTypeInput.setValue("nucleotide", alignment);
                    totalCount = 4;
                } else if (dataTypeLower.equals("aminoacid") || dataTypeLower.equals("protein")) {
                    alignment.dataTypeInput.setValue("aminoacid", alignment);
                    totalCount = 20;
                } else if (dataTypeLower.equals("standard")) {
                    // the number of states is derived from the symbols list, so it must be present
                    if (symbols == null) {
                        throw new IOException("symbols attribute expected for datatype=standard " +
                                "(e.g. 'format datatype=standard symbols=\"01\";'), not " + str);
                    }
                    alignment.dataTypeInput.setValue("standard", alignment);
                    totalCount = symbols.length();
                } else if (dataTypeLower.equals("binary")) {
                    alignment.dataTypeInput.setValue("binary", alignment);
                    totalCount = 2;
                } else {
                    alignment.dataTypeInput.setValue("integer", alignment);
                    if (symbols != null && (symbols.equals("01") || symbols.equals("012"))) {
                        totalCount = symbols.length();
                    }
                }
                final String missingChar = getAttValue("missing", str);
                if (missingChar != null) {
                    missing = missingChar;
                }
                final String gapChar = getAttValue("gap", str);
                if (gapChar != null) {
                    gap = gapChar;
                }
                matchChar = getAttValue("matchchar", str);
            }
        } while (!str.trim().toLowerCase().startsWith("matrix") && !str.toLowerCase().contains("charstatelabels"));

        if (alignment.dataTypeInput.get().equals("standard")) {
            final StandardData type = new StandardData();
            type.setInputValue("nrOfStates", totalCount);
            type.initAndValidate();
            alignment.setInputValue("userDataType", type);
        }

        //reading CHARSTATELABELS block
        if (str.toLowerCase().contains("charstatelabels")) {
            if (!alignment.dataTypeInput.get().equals("standard")) {
                throw new IllegalArgumentException("If CHARSTATELABELS block is specified then DATATYPE has to be Standard");
            }
            StandardData standardDataType = (StandardData)alignment.userDataTypeInput.get();
            // single element array, so that the token processor can report the maximum back
            int[] maxNumberOfStates = new int[] {0};
            ArrayList<String> tokens = readInCharstatelablesTokens(fin);
            ArrayList<UserDataType> charDescriptions = processCharstatelabelsTokens(tokens, maxNumberOfStates);

            standardDataType.setInputValue("charstatelabels", charDescriptions);
            standardDataType.setInputValue("nrOfStates", Math.max(maxNumberOfStates[0], totalCount));
            standardDataType.initAndValidate();
            for (UserDataType dataType : standardDataType.charStateLabelsInput.get()) {
                dataType.initAndValidate();
            }
        }

        //skipping before MATRIX block
        while (!str.toLowerCase().contains("matrix")) {
            str = nextLine(fin);
            if (str == null) {
                throw new IOException("Unexpected end of file in data block: no 'matrix' command found");
            }
        }

        // read character data
        // Use string builder for efficiency
        final Map<String, StringBuilder> seqMap = new HashMap<>();
        final List<String> taxa = new ArrayList<>();
        String prevTaxon = null;
        // number of non-whitespace characters read so far for the current taxon row
        int seqLen = 0;
        while (true) {
            str = nextLine(fin);
            if (str == null) {
                throw new IOException("Unexpected end of file in data block: matrix is not terminated by a semi colon");
            }

            int start = 0, end;
            final String taxon;
            while (start < str.length() && Character.isWhitespace(str.charAt(start))) {
                start++;
            }
            if (start == str.length()) {
                // nothing but whitespace on this line
                continue;
            }
            if (str.charAt(start) == '\'' || str.charAt(start) == '\"') {
                // quoted taxon name: runs up to the matching quote character
                final char c = str.charAt(start);
                start++;
                end = str.indexOf(c, start);
                if (end < 0) {
                    throw new IOException("Unclosed quote in taxon name: " + str);
                }
                taxon = str.substring(start, end);
                seqLen = 0;
                end++;
            } else {
                end = start;
                while (end < str.length() && !Character.isWhitespace(str.charAt(end))) {
                    end++;
                }
                if (end < str.length() || prevTaxon == null || seqLen == charCount) {
                    // the first token is a taxon name when data follows it on the same line, or
                    // when it stands alone and the previous taxon already has all its characters
                    taxon = str.substring(start, end);
                    seqLen = 0;
                } else {
                    // a single token while the previous taxon is still incomplete:
                    // the whole line is a continuation of that taxon's sequence
                    taxon = prevTaxon;
                    end = start;
                }
            }
            prevTaxon = taxon;
            String data = str.substring(end);
            for (int k = 0; k < data.length(); k++) {
                if (!Character.isWhitespace(data.charAt(k))) {
                    seqLen++;
                }
            }

            // whitespace is removed later, once per taxon, to save on regex evaluations
            data = data.replaceAll(";", "");
            if (data.trim().length() > 0) {
                final StringBuilder soFar = seqMap.get(taxon);
                if (soFar != null) {
                    soFar.append(data);
                } else {
                    seqMap.put(taxon, new StringBuilder(data));
                    taxa.add(taxon);
                }
            }
            if (str.contains(";")) {
                break;
            }

        }
        if (taxonCount > 0 && taxa.size() > taxonCount) {
            throw new IOException("Wrong number of taxa. Perhaps a typo in one of the taxa: " + taxa);
        }

        // an ambiguity set is a brace delimited group of states, e.g. {01}
        final Pattern ambiguityPattern = Pattern.compile("\\{(.*?)\\}");
        final Set<String> sortedAmbiguities = new HashSet<>();
        for (final String taxon : taxa) {
            taxonList.add(new Taxon(taxon));
            String data = seqMap.get(taxon).toString().replaceAll("\\s", "");
            // store the cleaned sequence: the first one serves as reference for match characters
            seqMap.put(taxon, new StringBuilder(data));

            //collect all ambiguities in the sequence
            final List<String> ambiguities = new ArrayList<>();
            final Matcher m = ambiguityPattern.matcher(data);
            while (m.find()) {
                ambiguities.add(m.group(1));
            }

            //sort elements of ambiguity sets
            final String dataWithoutAmbiguities = ambiguityPattern.matcher(data).replaceAll("?");
            for (final String amb : ambiguities) {
                final List<Integer> ambInt = new ArrayList<>();
                for (int i = 0; i < amb.length(); i++) {
                    final char c = amb.charAt(i);
                    if (c >= '0' && c <= '9') {
                        ambInt.add(c - '0');
                    } else {
                        // sets with non-digit states are not supported: treat every ambiguity
                        // set of this sequence as missing, and warn only once per taxon
                        if (!data.equals(dataWithoutAmbiguities)) {
                            Log.warning.println("Ambiguity found in " + taxon + " that is treated as missing value");
                        }
                        data = dataWithoutAmbiguities;
                    }
                }
                Collections.sort(ambInt);
                final StringBuilder ambStr = new StringBuilder();
                for (final Integer state : ambInt) {
                    ambStr.append(state);
                }
                sortedAmbiguities.add(ambStr.toString());
            }

            //check the length of the sequence (treat ambiguity sets as single characters)
            if (dataWithoutAmbiguities.length() != charCount) {
                throw new IOException(str + "\nExpected sequence of length " + charCount + " instead of " +
                        dataWithoutAmbiguities.length() + " for taxon " + taxon);
            }

            // map to standard missing and gap chars
            data = data.replace(missing.charAt(0), DataType.MISSING_CHAR);
            data = data.replace(gap.charAt(0), DataType.GAP_CHAR);

            // resolve matching char, if any
            if (matchChar != null && data.contains(matchChar)) {
                final char cMatchChar = matchChar.charAt(0);
                final String baseData = seqMap.get(taxa.get(0)).toString();
                final char[] resolved = data.toCharArray();
                for (int i = 0; i < resolved.length; i++) {
                    if (resolved[i] == cMatchChar) {
                        resolved[i] = baseData.charAt(i);
                    }
                }
                data = new String(resolved);
            }

            // Using Alignment as Map gives problems when producing XML:
            // Sequence names are used as attribute names, producing very readable XML
            // However, since attribute names cannot start with a number or contain
            // special characters (like ":" or "]") but sequence names do contain them
            // on occasion, it is more robust to create a Sequence object for each
            // sequence where the taxon name is stored as an XML attribute values
            // that do not have the attribute name restrictions.
            final Sequence sequence = new Sequence();
            sequence.init(totalCount, taxon, data);
            sequence.setID(generateSequenceID(taxon));
            alignment.sequenceInput.setValue(sequence, alignment);
        }


        if (alignment.dataTypeInput.get().equals("standard")) {
            //convert sortedAmbiguities to a whitespace separated string of ambiguities
            final StringBuilder ambiguitiesStr = new StringBuilder();
            for (final String amb : sortedAmbiguities) {
                ambiguitiesStr.append(amb).append(' ');
            }
            if (ambiguitiesStr.length() > 0) {
                // drop the trailing separator
                ambiguitiesStr.setLength(ambiguitiesStr.length() - 1);
            }
            alignment.userDataTypeInput.get().initByName("ambiguities", ambiguitiesStr.toString());
        }

        alignment.initAndValidate();
        if (taxonCount > 0 && taxonCount != alignment.getTaxonCount()) {
            throw new IOException("dimensions block says there are " + taxonCount + " taxa, but there were " + alignment.getTaxonCount() + " taxa found");
        }
        return alignment;
    } // parseDataBlock

    /**
     * Accumulates text until it contains a ';' and returns it with every ';'
     * replaced by a space.
     *
     * @param str text read so far; lines obtained from {@link #nextLine} are
     *            appended to it until a ';' is present
     * @param fin reader the additional lines are taken from
     * @return the accumulated text with all ';' characters replaced by spaces
     * @throws IOException if reading fails or the input ends before a ';' is found
     * @throws IllegalArgumentException if the accumulated text contains "matrix"
     *            (in any case), which suggests a ';' is missing before the matrix
     */
    private String getNextDataBlock(String str, BufferedReader fin) throws IOException {
        final StringBuilder statement = new StringBuilder(str);
        while (statement.indexOf(";") < 0) {
            final String line = nextLine(fin);
            if (line == null) {
                // nextLine returns null when the input is exhausted; appending it
                // would add the text "null" and never terminate this loop
                throw new IOException("Unexpected end of file: no ';' found after: " + str);
            }
            statement.append(line);
        }
        final String result = statement.toString().replace(";", " ");

        if (result.toLowerCase().matches(".*matrix.*")) {
            // we only get here when the text collected up to the first ';' already
            // contains the matrix keyword
            throw new IllegalArgumentException("Mallformed nexus file: perhaps a semi colon is missing before 'matrix'");
        }
        return result;
    }

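    /**
     * Parse an assumptions block. Lines starting with charset, wtset, taxset
     * or calibrate are interpreted; reading stops at a line containing "end;".
     * Examples:
     *
     * begin assumptions;
     * charset firsthalf = 1-449;
     * charset secondhalf = 450-898;
     * charset third = 1-457\3 662-896\3;
     * end;
     * 
     * begin assumptions;
     * wtset MySoapWeights (VECTOR) = 13 13 13 50 50 88 8
     * end;
     */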
    void parseAssumptionsBlock(final BufferedReader fin) throws IOException {
        // Reads lines until one containing "end;" is found and interprets four kinds
        // of statements:
        //   charset   -> a FilteredAlignment on m_alignment is added to filteredAlignments
        //   wtset     -> site weights are set on m_alignment
        //   taxset    -> a TaxonSet is added to taxonsets
        //   calibrate -> an MRCAPrior with a parametric distribution is added to calibrations
        // Throws IOException when the input ends before "end;" is seen and
        // RuntimeException for malformed taxset/calibrate statements or when the
        // number of weights does not match the number of sites.
        String str;
        do {
            str = nextLine(fin);
            if (str == null) {
                // nextLine returns null when there is no more input; report this
                // instead of failing with a NullPointerException below
                throw new IOException("Unexpected end of file while parsing assumptions block: 'end;' is missing");
            }
            final String lowerCaseLine = str.toLowerCase();
            if (lowerCaseLine.matches("\\s*charset\\s.*")) {
                // remove text in brackets (as TreeBase files are wont to contain)
                str = str.replaceAll("\\(.*\\)", "");
                // clean up spaces
                str = str.replaceAll("^\\s+", "");
                str = str.replaceAll("\\s*-\\s*", "-");
                str = str.replaceAll("\\s*\\\\\\s*", "\\\\");
                str = str.replaceAll("\\s*;", "");
                final String[] strs = str.trim().split("\\s+");
                final String id = strs[1];
                // everything from the fourth word on is taken as a range; this assumes
                // the form 'charset <name> = <range> <range> ...'
                final StringBuilder ranges = new StringBuilder();
                for (int i = 3; i < strs.length; i++) {
                    ranges.append(strs[i]).append(' ');
                }
                final String rangeString = ranges.toString().trim().replace(' ', ',');
                final FilteredAlignment alignment = new FilteredAlignment();
                alignment.setID(id);
                alignment.alignmentInput.setValue(m_alignment, alignment);
                alignment.filterInput.setValue(rangeString, alignment);
                alignment.initAndValidate();
                filteredAlignments.add(alignment);
            } else if (lowerCaseLine.matches("\\s*wtset\\s.*")) {
                String [] strs = str.split("=");
                if (strs.length > 1) {
                    // the weights are the whitespace separated words after the last '='
                    str = strs[strs.length - 1].trim();
                    strs = str.split("\\s+");
                    for (final String weight : strs) {
                        // validation only: a NumberFormatException is raised for a non-integer weight
                        Integer.parseInt(weight);
                    }
                    if (m_alignment != null) {
                        if (strs.length != m_alignment.getSiteCount()) {
                            throw new RuntimeException("Number of weights (" + strs.length+ ") " +
                                    "does not match number of sites in alignment(" + m_alignment.getSiteCount()+ ")");
                        }
                        // pass the weights on as a comma separated list
                        final StringBuilder weightStr = new StringBuilder();
                        for (int i = 0; i < strs.length; i++) {
                            if (i > 0) {
                                weightStr.append(',');
                            }
                            weightStr.append(strs[i]);
                        }
                        m_alignment.siteWeightsInput.setValue(weightStr.toString(), m_alignment);
                        m_alignment.initAndValidate();
                    } else {
                        Log.warning.println("WTSET was specified before alignment. WTSET is ignored.");
                    }
                }
            } else if (lowerCaseLine.matches("\\s*taxset\\s.*")) {
                String [] strs = str.split("=");
                if (strs.length > 1) {
                    String str0 = strs[0].trim();
                    String [] strs2 = str0.split("\\s+");
                    if (strs2.length != 2) {
                        throw new RuntimeException("expected 'taxset <name> = ...;' but did not get two words before the = sign: " + str);
                    }
                    String taxonSetName = strs2[1];
                    str0 = strs[strs.length - 1].trim();
                    if (!str0.endsWith(";")) {
                        Log.warning.println("expected 'taxset <name> = ...;' semi-colin is missing: " + str + "\n"
                                + "Taxa from following lines may be missing.");
                    }
                    str0 = str0.replaceAll(";", "");
                    String [] taxonNames = str0.split("\\s+");
                    TaxonSet taxonset = new TaxonSet();
                    for (String taxon : taxonNames) {
                        // strip single and double quotes; the character class is needed
                        // because the pattern '" only matches that two character sequence
                        taxonset.taxonsetInput.get().add(new Taxon(taxon.replaceAll("['\"]", "")));
                    }
                    taxonset.setID(taxonSetName.replaceAll("['\"]", ""));
                    taxonsets.add(taxonset);
                }
            } else if (lowerCaseLine.matches("^\\s*calibrate\\s.*")) {
                // define calibration represented by an MRCAPRior, 
                // taxon sets need to be specified earlier, but can also be a single taxon
                // e.g.
                // begin mrbayes;
                // calibrate germanic = normal(1000,50)
                // calibrate hittite = normal(3450,100)
                // calibrate english = fixed(0)
                // end;
                String [] strs = str.split("=");
                if (strs.length > 1) {
                    String str0 = strs[0].trim();
                    String [] strs2 = str0.split("\\s+");
                    if (strs2.length != 2) {
                        throw new RuntimeException("expected 'calibrate <name> = ...' but did not get two words before the = sign: " + str);
                    }
                    // first, get the taxon
                    String taxonSetName = strs2[1].replaceAll("['\"]", "");
                    TaxonSet taxonset = null;
                    for (Taxon t : taxonsets) {
                        if (t.getID().equals(taxonSetName) && t instanceof TaxonSet) {
                            taxonset = (TaxonSet) t;
                        }
                    }
                    if (taxonset == null) {
                        // perhaps it is a singleton
                        for (Taxon t : taxonList) {
                            if (t.getID().equals(taxonSetName)) {
                                taxonset = new TaxonSet();
                                taxonset.setID(t.getID() + ".leaf");
                                taxonset.taxonsetInput.setValue(t, taxonset);
                            }
                        }
                    }
                    if (taxonset == null) {
                        throw new RuntimeException("Could not find taxon/taxonset " + taxonSetName + " in calibration: " + str);
                    }

                    // next get the calibration: strs3[0] is the distribution name,
                    // the remaining entries are its arguments
                    str0 = strs[strs.length - 1].trim();
                    String [] strs3 = str0.split("[\\(,\\)]");
                    RealParameter [] param = new RealParameter[strs3.length];
                    for (int i = 1; i < strs3.length; i++) {
                        try {
                            param[i] = new RealParameter(strs3[i]);
                            param[i].setID("param." + i);
                        } catch (Exception ignored) {
                            // ignore parsing errors: the entry stays null and the
                            // raw string in strs3 is still available to the cases below
                        }
                    }
                    ParametricDistribution distr  = null;
                    switch (strs3[0]) {
                    case "normal":
                        distr = new Normal();
                        distr.initByName("mean", param[1], "sigma", param[2]);
                        distr.setID("Normal.0");
                        break;
                    case "uniform":
                        distr = new Uniform();
                        distr.initByName("lower", strs3[1], "upper", strs3[2]);
                        distr.setID("Uniform.0");
                        break;
                    case "fixed":
                        // NOTE(review): an earlier comment here said "uniform with lower == upper",
                        // but the code builds a Normal with infinite sigma - confirm which is intended
                        distr = new Normal();
                        distr.initByName("mean", param[1], "sigma", "+Infinity");
                        distr.setID("Normal.0");
                        break;
                    case "offsetlognormal":
                        distr = new LogNormalDistributionModel();
                        distr.initByName("offset", strs3[1], "M", param[2], "S", param[3], "meanInRealSpace", true);
                        distr.setID("LogNormalDistributionModel.0");
                        break;
                    case "lognormal":
                        distr = new LogNormalDistributionModel();
                        distr.initByName("M", param[1], "S", param[2], "meanInRealSpace", true);
                        distr.setID("LogNormalDistributionModel.0");
                        break;
                    case "offsetexponential":
                        distr = new Exponential();
                        distr.initByName("offset", strs3[1], "mean", param[2]);
                        distr.setID("Exponential.0");
                        break;
                    case "gamma":
                        distr = new Gamma();
                        distr.initByName("alpha", param[1], "beta", param[2]);
                        distr.setID("Gamma.0");
                        break;
                    case "offsetgamma":
                        distr = new Gamma();
                        distr.initByName("offset", strs3[1], "alpha", param[2], "beta", param[3]);
                        distr.setID("Gamma.0");
                        break;
                    default:
                        throw new RuntimeException("Unknwon distribution "+ strs3[0] +"in calibration: " + str);
                    }
                    MRCAPrior prior = new MRCAPrior();
                    prior.isMonophyleticInput.setValue(true, prior);
                    prior.distInput.setValue(distr, prior);
                    prior.taxonsetInput.setValue(taxonset, prior);
                    prior.setID(taxonset.getID() + ".prior");
                    // should set Tree before initialising, but we do not know the tree yet...
                    if (calibrations == null) {
                        calibrations = new ArrayList<>();
                    }
                    calibrations.add(prior);
                }
            }

            // note: str may have been rewritten by the charset and wtset branches above
        } while (!str.toLowerCase().contains("end;"));
    }

    
    /**
     * Adds a monophyletic MRCAPrior to the calibrations for every taxon set
     * in taxonsets that has no MRCAPrior among its outputs.
     */
    private void processSets() {
        nextTaxonSet:
        for (final TaxonSet set : taxonsets) {
            // nothing to do for sets that already have an MRCAPrior among their outputs
            for (final BEASTInterface output : set.getOutputs()) {
                if (output instanceof MRCAPrior) {
                    continue nextTaxonSet;
                }
            }

            final MRCAPrior monophylyPrior = new MRCAPrior();
            monophylyPrior.isMonophyleticInput.setValue(true, monophylyPrior);
            monophylyPrior.taxonsetInput.setValue(set, monophylyPrior);
            monophylyPrior.setID(set.getID() + ".prior");
            // should set Tree before initialising, but we do not know the tree yet...
            if (calibrations == null) {
                calibrations = new ArrayList<>();
            }
            calibrations.add(monophylyPrior);
        }
    }


    /**
     * parse sets block
     * BEGIN Sets;
     * TAXSET 'con' = 'con_SL_Gert2' 'con_SL_Tran6' 'con_SL_Tran7' 'con_SL_Gert6';
     * TAXSET 'spa' = 'spa_138a_Cerb' 'spa_JB_Eyre1' 'spa_JB_Eyre2';
     * END; [Sets]
     */
    void parseSetsBlock(final BufferedReader fin) throws IOException {
        // Reads lines until one containing "end;" is found; every
        // 'taxset <name> = <taxon> <taxon> ...;' line is turned into a TaxonSet
        // that is added to taxonsets. Throws IOException when the input ends
        // before "end;" is seen and RuntimeException for a malformed taxset line.
        String str;
        do {
            str = nextLine(fin);
            if (str == null) {
                // nextLine returns null when there is no more input; report this
                // instead of failing with a NullPointerException below
                throw new IOException("Unexpected end of file while parsing sets block: 'end;' is missing");
            }
            if (str.toLowerCase().matches("\\s*taxset\\s.*")) {
                final String [] strs = str.split("=");
                if (strs.length > 1) {
                    // the part before '=' must be exactly 'taxset <name>'
                    final String [] strs2 = strs[0].trim().split("\\s+");
                    if (strs2.length != 2) {
                        throw new RuntimeException("expected 'taxset <name> = ...;' but did not get two words before the = sign: " + str);
                    }
                    final String taxonSetName = strs2[1];
                    // the taxon names are the whitespace separated words after the last '='
                    String str0 = strs[strs.length - 1].trim();
                    if (!str0.endsWith(";")) {
                        Log.warning.println("expected 'taxset <name> = ...;' semi-colin is missing: " + str + "\n"
                                + "Taxa from following lines may be missing.");
                    }
                    str0 = str0.replaceAll(";", "");
                    final String [] taxonNames = str0.split("\\s+");
                    final TaxonSet taxonset = new TaxonSet();
                    for (final String taxon : taxonNames) {
                        // strip single and double quotes; the character class is needed
                        // because the pattern '" only matches that two character sequence
                        taxonset.taxonsetInput.get().add(new Taxon(taxon.replaceAll("['\"]", "")));
                    }
                    taxonset.setID(taxonSetName.replaceAll("['\"]", ""));
                    taxonsets.add(taxonset);
                }
            }
        } while (!str.toLowerCase().contains("end;"));
    }

    /**
     * Creates a sequence ID of the form "seq_" + taxon that is not yet contained
     * in g_sequenceIDs, appending the smallest positive number that makes it
     * unused when necessary, and registers the ID in g_sequenceIDs.
     *
     * @param taxon taxon name the ID is derived from
     * @return the newly registered ID
     */
    public static String generateSequenceID(final String taxon) {
        final String base = "seq_" + taxon;
        String candidate = base;
        // try base, base1, base2, ... until an unused ID is found
        for (int suffix = 1; g_sequenceIDs.contains(candidate); suffix++) {
            candidate = base + suffix;
        }
        g_sequenceIDs.add(candidate);
        return candidate;
    }

    /**
     * Read the next raw line from the nexus file and advance the line counter.
     * Returns null when no further input is available.
     */
    String readLine(final BufferedReader fin) throws IOException {
        // BufferedReader.readLine() returns null at the end of the stream, so that
        // is used as the end-of-input test. ready() only tells whether a read is
        // guaranteed not to block, so using it here could drop the remaining input
        // of a reader that is merely slow (e.g. a pipe).
        final String line = fin.readLine();
        if (line != null) {
            // only count lines that were actually read
            lineNr++;
        }
        return line;
    }

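    /**
     * Read the next line from the nexus file that is not empty after removing
     * the first [...] comment starting on it (the comment may extend over
     * several lines). Returns null when no further line can be read.
     */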
    String nextLine(final BufferedReader fin) throws IOException {
        // Loop instead of recursing, so a long run of blank or comment-only lines
        // cannot exhaust the stack.
        while (true) {
            String str = readLine(fin);
            if (str == null) {
                return null;
            }
            final int start = str.indexOf('[');
            if (start >= 0) {
                // remove the first comment; keep appending lines until its closing
                // bracket has been read
                int end = str.indexOf(']', start);
                while (end < 0) {
                    final String continuation = readLine(fin);
                    if (continuation == null) {
                        // without this check the text "null" would be appended forever
                        throw new IOException("Unexpected end of file: comment starting with '[' is not closed (line " + lineNr + ")");
                    }
                    str += continuation;
                    end = str.indexOf(']', start);
                }
                str = str.substring(0, start) + str.substring(end + 1);
            }
            if (!str.matches("^\\s*$")) {
                return str;
            }
            // the line is empty or only whitespace: try the next one
        }
    }

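    /**
     * Return the value of "attribute = value" found in str as a string, or
     * null if the attribute is not found. A value enclosed in double quotes is
     * returned without the quotes; an unquoted value is returned in lower case.
     */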
    String getAttValue(final String attribute, final String str) {
        // Match case-insensitively on the original text rather than on a lower-cased
        // copy: the offsets reported by the matcher then always refer to str, and the
        // match does not depend on the default locale's lower-casing rules.
        final Pattern pattern = Pattern.compile(".*" + attribute + "\\s*=\\s*([^\\s;]+).*", Pattern.CASE_INSENSITIVE);
        final Matcher matcher = pattern.matcher(str);
        if (!matcher.find()) {
            return null;
        }
        final String att = matcher.group(1);
        if (att.startsWith("\"")) {
            // quoted value: take everything up to the closing quote, in its original
            // case; this also covers values containing whitespace, where group 1
            // holds only the part before the first space
            final int start = matcher.start(1);
            final int closingQuote = str.indexOf('"', start + 1);
            if (closingQuote >= 0) {
                return str.substring(start + 1, closingQuote);
            }
            // no closing quote: fall through and return the token as it was matched
        }
        // unquoted values are reported in lower case
        return att.toLowerCase(java.util.Locale.ROOT);
    }

    /**
     * Reads lines from the nexus file and splits them into tokens until a ';'
     * outside of a quoted token is found.
     *
     * Tokens are separated by whitespace or line breaks; '/' and ',' are tokens
     * of their own. A token may be enclosed in single quotes, in which case it
     * can contain separators, and two consecutive single quotes inside it stand
     * for one quote character. If the last token is not a ",", a "," is appended.
     *
     * @param fin reader positioned after the keyword that starts the statement
     * @return the list of tokens; empty if the statement contains no tokens
     * @throws IOException if reading fails or the input ends before the ';'
     */
    private ArrayList<String> readInCharstatelablesTokens(final BufferedReader fin) throws IOException {

        final ArrayList<String> tokens = new ArrayList<>();
        final StringBuilder token = new StringBuilder();
        final int READING=0, OPENQUOTE=1, WAITING=2;
        int mode = WAITING;
        // number of consecutive quotes seen inside a quoted token; the opening quote is not counted
        int numberOfQuotes=0;
        boolean endOfBlock=false;

        while (!endOfBlock) {
            final String str = nextLine(fin);
            if (str == null) {
                throw new IOException("Unexpected end of file: ';' is missing at the end of the character state labels");
            }
            // stop at the terminating ';' so anything after it on the same line
            // cannot produce further (duplicate) tokens
            for (int i = 0; i < str.length() && !endOfBlock; i++) {
                final char nextChar = str.charAt(i);
                switch (mode) {
                    case WAITING:
                        if (!Character.isWhitespace(nextChar)) {
                            if (nextChar == '\'') {
                                mode = OPENQUOTE;
                            } else if (nextChar == '/' || nextChar == ',') {
                                tokens.add(String.valueOf(nextChar));
                                token.setLength(0);
                            } else if (nextChar == ';') {
                                endOfBlock = true;
                            } else {
                                token.append(nextChar);
                                mode = READING;
                            }
                        }
                        break;
                    case READING:
                        if (nextChar == '\'') {
                            tokens.add(token.toString());
                            token.setLength(0);
                            mode = OPENQUOTE;
                        } else if (nextChar == '/' || nextChar == ',') {
                            tokens.add(token.toString());
                            tokens.add(String.valueOf(nextChar));
                            token.setLength(0);
                            mode = WAITING;
                        } else if (nextChar == ';') {
                            tokens.add(token.toString());
                            token.setLength(0);
                            endOfBlock = true;
                        } else if (Character.isWhitespace(nextChar)) {
                            tokens.add(token.toString());
                            token.setLength(0);
                            mode = WAITING;
                        } else {
                            token.append(nextChar);
                        }
                        break;
                    case OPENQUOTE:
                        if (nextChar == '\'') {
                            numberOfQuotes++;
                        } else {
                            // every pair of quotes is one literal quote character
                            for (int ind = 0; ind < numberOfQuotes / 2; ind++) {
                                token.append('\'');
                            }
                            if (numberOfQuotes % 2 == 0) {
                                // still inside the quoted token
                                token.append(nextChar);
                            } else {
                                // an unpaired quote closes the token
                                tokens.add(token.toString());
                                token.setLength(0);
                                if (nextChar == '/' || nextChar == ',') {
                                    tokens.add(String.valueOf(nextChar));
                                    mode = WAITING;
                                } else if (nextChar == ';') {
                                    endOfBlock = true;
                                } else if (Character.isWhitespace(nextChar)) {
                                    mode = WAITING;
                                } else {
                                    token.append(nextChar);
                                    mode = READING;
                                }
                            }
                            numberOfQuotes = 0;
                        }
                        break;
                    default:
                        break;
                }
            }

            if (!endOfBlock) {
                // A line break separates tokens like whitespace does; without this a
                // token ending a line would be glued to the first token of the next line.
                if (mode == READING) {
                    tokens.add(token.toString());
                    token.setLength(0);
                    mode = WAITING;
                } else if (mode == OPENQUOTE && numberOfQuotes % 2 == 1) {
                    // the quoted token was closed by the last character of the line
                    for (int ind = 0; ind < numberOfQuotes / 2; ind++) {
                        token.append('\'');
                    }
                    tokens.add(token.toString());
                    token.setLength(0);
                    numberOfQuotes = 0;
                    mode = WAITING;
                }
            }
        }

        // make sure the last description is terminated by a ","; an empty token
        // list is returned as is
        if (!tokens.isEmpty() && !tokens.get(tokens.size()-1).equals(",")) {
            tokens.add(",");
        }

        return tokens;
    }

    /**
     * Turns a token list into one UserDataType per character description.
     *
     * A description consists of a number, optionally a name, optionally a "/"
     * followed by state tokens, and is closed by a ",".
     *
     * @param tokens tokens of the character state labels
     * @param maxNumberOfStates element 0 is raised to the largest number of
     *                          states found in any description
     * @return the character descriptions in the order they were found
     * @throws IOException if a description has a number larger than its
     *                     position in the list
     */
    private ArrayList<UserDataType> processCharstatelabelsTokens(ArrayList<String> tokens, int[] maxNumberOfStates) throws IOException {

        final ArrayList<UserDataType> descriptions = new ArrayList<>();

        final int EXPECT_NUMBER = 0, EXPECT_NAME = 1, EXPECT_STATES = 2;
        int phase = EXPECT_NUMBER;
        int number = -1;
        String name = "";
        ArrayList<String> stateList = new ArrayList<>();

        for (final String token : tokens) {
            if (phase == EXPECT_NUMBER) {
                number = Integer.parseInt(token);
                phase = EXPECT_NAME;
            } else if (token.equals(",")) {
                // a "," closes the current description, whether or not states were listed
                if (number > descriptions.size() + 1) {
                    throw new IOException("Character descriptions should go in the ascending order and there " +
                            "should not be any description missing.");
                }
                descriptions.add(new UserDataType(name, stateList));
                maxNumberOfStates[0] = Math.max(maxNumberOfStates[0], stateList.size());
                // start over for the next description
                number = -1;
                name = "";
                stateList = new ArrayList<>();
                phase = EXPECT_NUMBER;
            } else if (phase == EXPECT_NAME) {
                if (token.equals("/")) {
                    phase = EXPECT_STATES;
                } else {
                    name = token;
                }
            } else {
                // EXPECT_STATES: every other token is a state
                stateList.add(token);
            }
        }

        return descriptions;

    }

    /**
     * Parses the nexus file named by the first argument and prints what was
     * found: the taxa, the number of trees, and the alignment and trait set
     * as XML.
     *
     * @param args args[0] is the path of the nexus file
     */
    public static void main(final String[] args) {
        if (args == null || args.length == 0) {
            // without a file name there is nothing to parse
            System.err.println("Usage: java beast.util.NexusParser <nexus file>");
            return;
        }
        try {
            final NexusParser parser = new NexusParser();
            parser.parseFile(new File(args[0]));
            if (parser.taxa != null) {
                System.out.println(parser.taxa.size() + " taxa");
                System.out.println(Arrays.toString(parser.taxa.toArray(new String[parser.taxa.size()])));
            }
            if (parser.trees != null) {
                System.out.println(parser.trees.size() + " trees");
            }
            if (parser.m_alignment != null) {
                final String xml = new XMLProducer().toXML(parser.m_alignment);
                System.out.println(xml);
            }
            if (parser.traitSet != null) {
                final String xml = new XMLProducer().toXML(parser.traitSet);
                System.out.println(xml);
            }
        } catch (Exception e) {
            // report parse and I/O problems on the console
            e.printStackTrace();
        }
    } // main

} // class NexusParser