package beast.util; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.io.Reader; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.SortedSet; import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; import beast.core.BEASTInterface; import beast.core.parameter.RealParameter; import beast.core.util.Log; import beast.evolution.alignment.Alignment; import beast.evolution.alignment.FilteredAlignment; import beast.evolution.alignment.Sequence; import beast.evolution.alignment.Taxon; import beast.evolution.alignment.TaxonSet; import beast.evolution.datatype.DataType; import beast.evolution.datatype.StandardData; import beast.evolution.datatype.UserDataType; import beast.evolution.tree.TraitSet; import beast.evolution.tree.Tree; import beast.math.distributions.Exponential; import beast.math.distributions.Gamma; import beast.math.distributions.LogNormalDistributionModel; import beast.math.distributions.MRCAPrior; import beast.math.distributions.Normal; import beast.math.distributions.ParametricDistribution; import beast.math.distributions.Uniform; /** * parses nexus file and grabs alignment and calibration from the file * */ public class NexusParser { /** * keep track of nexus file line number, to report when the file does not parse * */ int lineNr; /** * Beast II objects reconstructed from the file* */ public Alignment m_alignment; public List<Alignment> filteredAlignments = new ArrayList<>(); public TraitSet traitSet; public List<MRCAPrior> calibrations; public List<String> taxa; List<Taxon> taxonList = new ArrayList<>(); public List<Tree> trees; static Set<String> g_sequenceIDs; public Map<String, String> translationMap = null; static { g_sequenceIDs = new HashSet<>(); } public List<TaxonSet> taxonsets = 
new ArrayList<>(); private List<NexusParserListener> listeners = new ArrayList<>(); /** * Adds a listener for client classes that want to monitor progress of the parsing. * @param listener */ public void addListener(final NexusParserListener listener) { listeners.add(listener); } /** * Try to parse BEAST 2 objects from the given file * * @param file the file to parse. */ public void parseFile(final File file) throws IOException { final String fileName = file.getName().replaceAll(".*[\\/\\\\]", "").replaceAll("\\..*", ""); parseFile(fileName, new FileReader(file)); } /** * try to reconstruct Beast II objects from the given reader * * @param id a name to give to the parsed results * @param reader a reader to parse from * TODO: RRB: throws IOException now instead of just Exception. * java.text.ParseException seems more appropriate, but requires keeping track of the position in the file, which is non-trivial */ public void parseFile(final String id, final Reader reader) throws IOException { lineNr = 0; final BufferedReader fin; if (reader instanceof BufferedReader) { fin = (BufferedReader) reader; } else { fin = new BufferedReader(reader); } try { while (fin.ready()) { final String str = nextLine(fin); if (str == null) { processSets(); return; } final String lower = str.toLowerCase(); if (lower.matches("^\\s*begin\\s+data;\\s*$") || lower.matches("^\\s*begin\\s+characters;\\s*$")) { m_alignment = parseDataBlock(fin); m_alignment.setID(id); } else if (lower.matches("^\\s*begin\\s+calibration;\\s*$")) { traitSet = parseCalibrationsBlock(fin); } else if (lower.matches("^\\s*begin\\s+assumptions;\\s*$") || lower.matches("^\\s*begin\\s+sets;\\s*$") || lower.matches("^\\s*begin\\s+mrbayes;\\s*$")) { parseAssumptionsBlock(fin); } else if (lower.matches("^\\s*begin\\s+taxa;\\s*$")) { parseTaxaBlock(fin); } else if (lower.matches("^\\s*begin\\s+trees;\\s*$")) { parseTreesBlock(fin); } } processSets(); } catch (TreeParser.TreeParsingException e) { int errorLine = lineNr + 1; if 
(e.getLineNum() != null) errorLine += e.getLineNum()-1; String errorMsg = "Encountered error interpreting the Newick string found around line " + errorLine + " of the input file."; if (e.getCharacterNum() != null) errorMsg += "\nThe parser reports that the error occurred at character " + (e.getCharacterNum()+1) + " of the Newick string on this line."; errorMsg += "\nThe parser gives the following clue:\n" + e.getMessage(); throw new IOException(errorMsg); } catch (Exception e) { throw new IOException("Around line " + (lineNr+1) + "\n" + e.getMessage()); } } // parseFile private void parseTreesBlock(final BufferedReader fin) throws IOException { trees = new ArrayList<>(); // read to first non-empty line within trees block String str = readLine(fin).trim(); while (str.equals("")) { str = readLine(fin).trim(); } int origin = -1; // if first non-empty line is "translate" then parse translate block if (str.toLowerCase().contains("translate")) { translationMap = parseTranslateBlock(fin); origin = getIndexedTranslationMapOrigin(translationMap); if (origin != -1) { taxa = getIndexedTranslationMap(translationMap, origin); } } // read trees while (str != null) { if (str.toLowerCase().startsWith("tree ")) { final int i = str.indexOf('('); if (i > 0) { str = str.substring(i); } TreeParser treeParser; if (origin != -1) { treeParser = new TreeParser(taxa, str, origin, false); } else { try { treeParser = new TreeParser(taxa, str, 0, false); } catch (ArrayIndexOutOfBoundsException e) { treeParser = new TreeParser(taxa, str, 1, false); } } // catch (NullPointerException e) { // treeParser = new TreeParser(m_taxa, str, 1); // } if (translationMap != null) treeParser.translateLeafIds(translationMap); // this needs to go after translation map or listeners have an incomplete tree! 
for (final NexusParserListener listener : listeners) { listener.treeParsed(trees.size(), treeParser); } // this must come after listener or trees.size() gives the wrong index to treeParsed trees.add(treeParser); // Node tree = treeParser.getRoot(); // tree.sort(); // tree.labelInternalNodes(nrOfLabels); } str = fin.readLine(); if (str != null) str = str.trim(); } } private List<String> getIndexedTranslationMap(final Map<String, String> translationMap, final int origin) { Log.warning.println("translation map size = " + translationMap.size()); final String[] taxa = new String[translationMap.size()]; for (final String key : translationMap.keySet()) { taxa[Integer.parseInt(key) - origin] = translationMap.get(key); } return Arrays.asList(taxa); } /** * @param translationMap * @return minimum key value if keys are a contiguous set of integers starting from zero or one, -1 otherwise */ private int getIndexedTranslationMapOrigin(final Map<String, String> translationMap) { final SortedSet<Integer> indices = new TreeSet<>(); int count = 0; for (final String key : translationMap.keySet()) { final int index = Integer.parseInt(key); indices.add(index); count += 1; } if ((indices.last() - indices.first() == count - 1) && (indices.first() == 0 || indices.first() == 1)) { return indices.first(); } return -1; } /** * @param reader a reader * @return a map of taxa translations, keys are generally integer node number starting from 1 * whereas values are generally descriptive strings. 
* @throws IOException if reading from the reader fails
     */
    private Map<String, String> parseTranslateBlock(final BufferedReader reader) throws IOException {
        final Map<String, String> translationMap = new HashMap<>();

        // Collect the whole translate command. It ends at the first ';', which may
        // sit on a line of its own or directly after the last entry; previously only
        // a line consisting of just ';' stopped the loop, so a ';' after the last
        // entry made the loop swallow the rest of the file.
        final StringBuilder translateBlock = new StringBuilder();
        String line = readLine(reader);
        while (line != null) {
            final String trimmed = line.trim();
            final int semicolon = trimmed.indexOf(';');
            if (semicolon >= 0) {
                translateBlock.append(trimmed, 0, semicolon);
                break;
            }
            translateBlock.append(trimmed);
            line = readLine(reader);
        }

        final String[] taxaTranslations = translateBlock.toString().split(",");
        for (final String taxaTranslation : taxaTranslations) {
            // trim so that "1 a, 2 b" on a single line does not yield a leading empty token
            final String entry = taxaTranslation.trim();
            if (entry.isEmpty()) {
                continue;
            }
            final String[] translation = entry.split("[\t ]+");
            if (translation.length == 2) {
                translationMap.put(translation[0], translation[1]);
            } else {
                Log.warning.println("Ignoring translation:" + Arrays.toString(translation));
            }
        }
        return translationMap;
    }

    /**
     * Parses a taxa block: reads the optional "dimensions ntax=..." command and the
     * labels following "taxlabels" up to the terminating "end". Labels are appended
     * to {@link #taxa} and {@code taxonList}. Labels wrapped in single or double
     * quotes may contain spaces; the quotes are removed.
     *
     * @param fin reader positioned just after the "begin taxa;" line
     * @throws IOException if the file ends before "end" is found, a quote is not closed,
     *                     or the number of labels differs from the declared ntax
     */
    private void parseTaxaBlock(final BufferedReader fin) throws IOException {
        taxa = new ArrayList<>();
        int expectedTaxonCount = -1;
        String str;
        do {
            str = nextLine(fin);
            if (str == null) {
                throw new IOException("Unexpected end of file in 'taxa' block: no 'end;' found");
            }
            if (str.toLowerCase().matches("\\s*dimensions\\s.*")) {
                // same attribute parser as the data block uses; tolerates "ntax = 12"
                // and further attributes on the same line
                final String ntax = getAttValue("ntax", str);
                if (ntax != null) {
                    expectedTaxonCount = Integer.parseInt(ntax);
                }
            } else if (str.toLowerCase().trim().startsWith("taxlabels")) {
                // labels may start on the taxlabels line itself
                str = str.trim().substring(9).trim();
                boolean readNext = (str.equals(""));
                do {
                    if (readNext) {
                        str = nextLine(fin);
                        if (str == null) {
                            throw new IOException("Unexpected end of file in 'taxa' block: no 'end;' found");
                        }
                    }
                    readNext = true;
                    str = str.replaceAll(";", "");
                    str = str.trim();
                    if (str.length() > 0 && !str.toLowerCase().equals("end")) {
                        final String[] strs = str.split("\\s+");
                        for (int i = 0; i < strs.length; i++) {
                            String taxon = strs[i];
                            final char first = taxon.charAt(0);
                            if (first == '\'' || first == '\"') {
                                // glue tokens together until the closing quote is found;
                                // a token that is only the opening quote is not closed yet
                                while (taxon.length() < 2 || first != taxon.charAt(taxon.length() - 1)) {
                                    i++;
                                    if (i == strs.length) {
                                        throw new IOException("Unclosed quote starting with " + taxon);
                                    }
                                    taxon += " " + strs[i];
                                }
                                taxon = taxon.substring(1, taxon.length() - 1);
                            }
                            taxa.add(taxon);
                            taxonList.add(new Taxon(taxon));
                        }
                    }
                } while (!str.toLowerCase().replaceAll(";", "").equals("end"));
            }
            // trim, so that an indented "end;" terminates the block as well
        } while (!str.trim().toLowerCase().replaceAll(";", "").equals("end"));
        if (expectedTaxonCount >= 0 && taxa.size() != expectedTaxonCount) {
            throw new IOException("Number of taxa (" + taxa.size() + ") is not equal to 'dimension' " +
                    "field (" + expectedTaxonCount + ") specified in 'taxa' block");
        }
    }

    /**
     * parse calibrations block and create TraitSet.
     * The resulting trait is named "date"; the entries read from the block are
     * rewritten into "taxon=date" pairs separated by ",\n".
     *
     * @param fin reader positioned just after the "begin calibration;" line
     * @return the initialised trait set, with taxa taken from {@link #m_alignment}
     * @throws IOException if the file ends inside the block or the block holds no
     *                     usable "date: taxa" entries
     */
    TraitSet parseCalibrationsBlock(final BufferedReader fin) throws IOException {
        final TraitSet traitSet = new TraitSet();
        traitSet.traitNameInput.setValue("date", traitSet);
        String str;
        do {
            str = nextLine(fin);
            if (str == null) {
                throw new IOException("Unexpected end of file in 'calibration' block");
            }
            if (str.toLowerCase().contains("options")) {
                String scale = getAttValue("scale", str);
                if (scale != null) {
                    // "years" -> "year" etc.
                    if (scale.endsWith("s")) {
                        scale = scale.substring(0, scale.length() - 1);
                    }
                    traitSet.unitsInput.setValue(scale, traitSet);
                }
            }
        } while (str.toLowerCase().contains("tipcalibration"));

        // gather all lines up to (but excluding) the first line that contains ';'
        // NOTE(review): anything on the line holding the ';' is discarded - confirm
        // that the terminating ';' is expected on a line of its own.
        final StringBuilder rawEntries = new StringBuilder();
        while (true) {
            str = nextLine(fin);
            if (str == null) {
                throw new IOException("Unexpected end of file in 'calibration' block: no ';' found");
            }
            if (str.contains(";")) {
                break;
            }
            rawEntries.append(str);
        }

        // each entry looks like "[name =] date : taxon1 taxon2 ..."
        final StringBuilder traits = new StringBuilder();
        for (final String entry : rawEntries.toString().split(",")) {
            if (entry.trim().isEmpty()) {
                continue;
            }
            final String[] parts = entry.split(":");
            if (parts.length < 2) {
                throw new IOException("Expected '<date>: <taxa>' in 'calibration' block, but got '" + entry.trim() + "'");
            }
            final String date = parts[0].replaceAll(".*=\\s*", "");
            for (final String taxon : parts[1].split("\\s+")) {
                if (!taxon.isEmpty()) {
                    if (traits.length() > 0) {
                        traits.append(",\n");
                    }
                    traits.append(taxon).append('=').append(date);
                }
            }
        }
        if (traits.length() == 0) {
            throw new IOException("No tip calibrations found in 'calibration' block");
        }
        traitSet.traitsInput.setValue(traits.toString(), traitSet);

        final TaxonSet taxonSet = new TaxonSet();
        taxonSet.initByName("alignment", m_alignment);
        traitSet.taxaInput.setValue(taxonSet, traitSet);

        traitSet.initAndValidate();
        return traitSet;
    } // parseCalibrations

    /**
     * parse data block and create Alignment *
     */
    public Alignment parseDataBlock(final BufferedReader fin) throws IOException {

        final Alignment alignment = new Alignment();

        String str;
        int taxonCount = -1;
        int charCount = -1;
        int totalCount = 4;
        String missing = "?";
        String gap = "-";
        // indicates character matches the one in the first sequence
        String matchChar = null;
        do {
            str =
nextLine(fin); //dimensions ntax=12 nchar=898; if (str.toLowerCase().contains("dimensions")) { str = getNextDataBlock(str, fin); final String character = getAttValue("nchar", str); if (character == null) { throw new IOException("nchar attribute expected (e.g. 'dimensions char=123') expected, not " + str); } charCount = Integer.parseInt(character); final String taxa = getAttValue("ntax", str); if (taxa != null) { taxonCount = Integer.parseInt(taxa); } } else if (str.toLowerCase().contains("format")) { str = getNextDataBlock(str, fin); //format datatype=dna interleave=no gap=-; final String dataTypeName = getAttValue("datatype", str); final String symbols; if (getAttValue("symbols", str) == null) { symbols = getAttValue("symbols", str); } else { symbols = getAttValue("symbols", str).replaceAll("\\s", ""); } if (dataTypeName == null) { Log.warning.println("Warning: expected datatype (e.g. something like 'format datatype=dna;') not '" + str + "' Assuming integer dataType"); alignment.dataTypeInput.setValue("integer", alignment); if (symbols != null && (symbols.equals("01") || symbols.equals("012"))) { totalCount = symbols.length(); } } else if (dataTypeName.toLowerCase().equals("rna") || dataTypeName.toLowerCase().equals("dna") || dataTypeName.toLowerCase().equals("nucleotide")) { alignment.dataTypeInput.setValue("nucleotide", alignment); totalCount = 4; } else if (dataTypeName.toLowerCase().equals("aminoacid") || dataTypeName.toLowerCase().equals("protein")) { alignment.dataTypeInput.setValue("aminoacid", alignment); totalCount = 20; } else if (dataTypeName.toLowerCase().equals("standard")) { alignment.dataTypeInput.setValue("standard", alignment); totalCount = symbols.length(); // if (symbols == null || symbols.equals("01")) { // alignment.dataTypeInput.setValue("binary", alignment); // totalCount = 2; // } else { // alignment.dataTypeInput.setValue("standard", alignment); // totalCount = symbols.length(); // } } else if (dataTypeName.toLowerCase().equals("binary")) 
{ alignment.dataTypeInput.setValue("binary", alignment); totalCount = 2; } else { alignment.dataTypeInput.setValue("integer", alignment); if (symbols != null && (symbols.equals("01") || symbols.equals("012"))) { totalCount = symbols.length(); } } final String missingChar = getAttValue("missing", str); if (missingChar != null) { missing = missingChar; } final String gapChar = getAttValue("gap", str); if (gapChar != null) { gap = gapChar; } matchChar = getAttValue("matchchar", str); } } while (!str.trim().toLowerCase().startsWith("matrix") && !str.toLowerCase().contains("charstatelabels")); if (alignment.dataTypeInput.get().equals("standard")) { StandardData type = new StandardData(); type.setInputValue("nrOfStates", totalCount); //type.setInputValue("symbols", symbols); type.initAndValidate(); alignment.setInputValue("userDataType", type); } //reading CHARSTATELABELS block if (str.toLowerCase().contains("charstatelabels")) { if (!alignment.dataTypeInput.get().equals("standard")) { throw new IllegalArgumentException("If CHARSTATELABELS block is specified then DATATYPE has to be Standard"); } StandardData standardDataType = (StandardData)alignment.userDataTypeInput.get(); int[] maxNumberOfStates = new int[] {0}; ArrayList<String> tokens = readInCharstatelablesTokens(fin); ArrayList<UserDataType> charDescriptions = processCharstatelabelsTokens(tokens, maxNumberOfStates); // while (true) { // str = nextLine(fin); // if (str.contains(";")) { // break; // } // String[] strSplit = str.split("/"); // ArrayList<String> states = new ArrayList<>(); // // if (strSplit.length < 2) { // charDescriptions.add(new UserDataType(strSplit[0], states)); // continue; // } // // String stateStr = strSplit[1]; // // //add a comma at the end of the string if the last non-whitespace character is not a comma or all the // // characters are whitespaces in the string. Also remove whitespaces at the end of the string. 
// for (int i=stateStr.length()-1; i>=0; i--) { // if (!Character.isWhitespace(stateStr.charAt(i))) { // if (stateStr.charAt(i-1) != ',') { // stateStr = stateStr.substring(0, i)+","; // break; // } // } // if (i==0) { // stateStr = stateStr.substring(0, i)+","; // } // } // if (stateStr.isEmpty()) { // stateStr = stateStr+","; // } // // final int WAITING=0, WORD=1, PHRASE_IN_QUOTES=2; // int mode =WAITING; //0 waiting for non-space letter, 1 reading a word; 2 reading a phrase in quotes // int begin =0, end; // // for (int i=0; i< stateStr.length(); i++) { // switch (mode) { // case WAITING: // while (stateStr.charAt(i) == ' ') { // i++; // } // mode = stateStr.charAt(i) == '\'' ? PHRASE_IN_QUOTES : WORD; // begin = i; // break; // case WORD: // end = stateStr.indexOf(" ", begin) != -1 ? stateStr.indexOf(" ", begin) : stateStr.indexOf(",", begin); // states.add(stateStr.substring(begin, end)); // i=end; // mode = WAITING; // break; // case PHRASE_IN_QUOTES: // end = begin; // do { // end = stateStr.indexOf("'", end+2); // } while (stateStr.charAt(end+1) == '\'' || end == -1); // if (end == -1) { // Log.info.println("Incorrect description in charstatelabels. 
Single quote found in line "); // } // end++; // states.add(stateStr.substring(begin, end)); // i=end; // mode=WAITING; // break; // default: // break; // } // } // // oldTODO make strSplit[0] look nicer (remove whitespaces and may be numbers at the beginning) // charDescriptions.add(new UserDataType(strSplit[0], states)); // maxNumberOfStates = Math.max(maxNumberOfStates, states.size()); // } standardDataType.setInputValue("charstatelabels", charDescriptions); standardDataType.setInputValue("nrOfStates", Math.max(maxNumberOfStates[0], totalCount)); standardDataType.initAndValidate(); for (UserDataType dataType : standardDataType.charStateLabelsInput.get()) { dataType.initAndValidate(); } } //skipping before MATRIX block while (!str.toLowerCase().contains(("matrix"))) { str = nextLine(fin); } // read character data // Use string builder for efficiency final Map<String, StringBuilder> seqMap = new HashMap<>(); final List<String> taxa = new ArrayList<>(); String prevTaxon = null; int seqLen = 0; while (true) { str = nextLine(fin); int start = 0, end; final String taxon; while (Character.isWhitespace(str.charAt(start))) { start++; } if (str.charAt(start) == '\'' || str.charAt(start) == '\"') { final char c = str.charAt(start); start++; end = start; while (str.charAt(end) != c) { end++; } taxon = str.substring(start, end); seqLen = 0; end++; } else { end = start; while (end < str.length() && !Character.isWhitespace(str.charAt(end))) { end++; } if (end < str.length()) { taxon = str.substring(start, end); seqLen = 0; } else if ((prevTaxon == null || seqLen == charCount) && end == str.length()) { taxon = str.substring(start, end); seqLen = 0; } else { taxon = prevTaxon; if (taxon == null) { throw new IOException("Could not recognise taxon"); } end = start; } } prevTaxon = taxon; String data = str.substring(end); for (int k = 0; k < data.length(); k++) { if (!Character.isWhitespace(data.charAt(k))) { seqLen++; } } // Do this once outside loop- save on multiple regex 
compilations //data = data.replaceAll("\\s", ""); // String [] strs = str.split("\\s+"); // String taxon = strs[0]; // for (int k = 1; k < strs.length - 1; k++) { // taxon += strs[k]; // } // taxon = taxon.replaceAll("'", ""); // Log.warning.println(taxon); // String data = strs[strs.length - 1]; data = data.replaceAll(";", ""); if (data.trim().length() > 0) { if (seqMap.containsKey(taxon)) { seqMap.put(taxon, seqMap.get(taxon).append(data)); } else { seqMap.put(taxon, new StringBuilder(data)); taxa.add(taxon); } } if (str.contains(";")) { break; } } if (taxonCount > 0 && taxa.size() > taxonCount) { throw new IOException("Wrong number of taxa. Perhaps a typo in one of the taxa: " + taxa); } HashSet<String> sortedAmbiguities = new HashSet<>(); for (final String taxon : taxa) { taxonList.add(new Taxon(taxon)); final StringBuilder bsData = seqMap.get(taxon); String data = bsData.toString().replaceAll("\\s", ""); seqMap.put(taxon, new StringBuilder(data)); //collect all ambiguities in the sequence List<String> ambiguities = new ArrayList<>(); Matcher m = Pattern.compile("\\{(.*?)\\}").matcher(data); while (m.find()) { int mLength = m.group().length(); ambiguities.add(m.group().substring(1, mLength-1)); } //sort elements of ambiguity sets String data_without_ambiguities = data.replaceAll("\\{(.*?)\\}", "?"); for (String amb : ambiguities) { List<Integer> ambInt = new ArrayList<>(); for (int i=0; i<amb.length(); i++) { char c = amb.charAt(i); if (c >= '0' && c <= '9') { ambInt.add(Integer.parseInt(amb.charAt(i) + "")); } else { // ignore if (data != data_without_ambiguities) { Log.warning.println("Ambiguity found in " + taxon + " that is treated as missing value"); } data = data_without_ambiguities; } } Collections.sort(ambInt); String ambStr = ""; for (int i=0; i<ambInt.size(); i++) { ambStr += Integer.toString(ambInt.get(i)); } sortedAmbiguities.add(ambStr); } //check the length of the sequence (treat ambiguity sets as single characters) if 
(data_without_ambiguities.length() != charCount) { throw new IOException(str + "\nExpected sequence of length " + charCount + " instead of " + data.length() + " for taxon " + taxon); } // map to standard missing and gap chars data = data.replace(missing.charAt(0), DataType.MISSING_CHAR); data = data.replace(gap.charAt(0), DataType.GAP_CHAR); // resolve matching char, if any if (matchChar != null && data.contains(matchChar)) { final char cMatchChar = matchChar.charAt(0); final String baseData = seqMap.get(taxa.get(0)).toString(); for (int i = 0; i < data.length(); i++) { if (data.charAt(i) == cMatchChar) { final char cReplaceChar = baseData.charAt(i); data = data.substring(0, i) + cReplaceChar + (i + 1 < data.length() ? data.substring(i + 1) : ""); } } } // Using Alignment as Map gives problems when producing XML: // Sequence names are used as attribute names, producing very readable XML // However, since attribute names cannot start with a number or contain // special characters (like ":" or "]") but sequence names do contain them // on occasion, it is more robust to create a Sequence object for each // sequence where the taxon name is stored as an XML attribute values // that do not have the attribute name restrictions. 
// if (alignment.dataTypeInput.get().equals("nucleotide") ||
            // alignment.dataTypeInput.get().equals("binary") ||
            // alignment.dataTypeInput.get().equals("aminoacid") ) {
            // alignment.setInputValue(taxon, data);
            // } else {
            final Sequence sequence = new Sequence();
            sequence.init(totalCount, taxon, data);
            sequence.setID(generateSequenceID(taxon));
            alignment.sequenceInput.setValue(sequence, alignment);
            // }
        }

        if (alignment.dataTypeInput.get().equals("standard")) {
            //convert sortedAmbiguities to a whitespace separated string of ambiguities
            String ambiguitiesStr = "";
            for (String amb: sortedAmbiguities) {
                ambiguitiesStr += amb + " ";
            }
            if (ambiguitiesStr.length() > 0) {
                ambiguitiesStr = ambiguitiesStr.substring(0, ambiguitiesStr.length()-1);
            }
            alignment.userDataTypeInput.get().initByName("ambiguities", ambiguitiesStr);
        }

        alignment.initAndValidate();
        if (taxonCount > 0 && taxonCount != alignment.getTaxonCount()) {
            throw new IOException("dimensions block says there are " + taxonCount + " taxa, but there were " + alignment.getTaxonCount() + " taxa found");
        }
        return alignment;
    } // parseDataBlock

    /**
     * Extends {@code str} with following lines until it contains a ';', then
     * returns it with every ';' replaced by a space.
     *
     * @param str first line of the command
     * @param fin reader to take continuation lines from
     * @return the complete command without semicolons
     * @throws IOException              if the file ends before a ';' is found
     * @throws IllegalArgumentException if the collected text contains "matrix", which
     *                                  indicates a missing ';' before the matrix command
     */
    private String getNextDataBlock(String str, BufferedReader fin) throws IOException {
        while (str.indexOf(';') < 0) {
            final String next = nextLine(fin);
            if (next == null) {
                // without this check the text "null" was appended for ever
                throw new IOException("Unexpected end of file: no ';' found after '" + str.trim() + "'");
            }
            str += next;
        }
        str = str.replace(";", " ");

        if (str.toLowerCase().matches(".*matrix.*")) {
            // will only get here when the lines collected above ran into the matrix command
            throw new IllegalArgumentException("Mallformed nexus file: perhaps a semi colon is missing before 'matrix'");
        }
        return str;
    }

    /**
     * parse assumptions block
     * begin assumptions;
     * charset firsthalf = 1-449;
     * charset secondhalf = 450-898;
     * charset third = 1-457\3 662-896\3;
     * end;
     *
     * begin assumptions;
     * wtset MySoapWeights (VECTOR) = 13 13 13 50 50 88 8
     * end;
     *
     * Also handles "taxset" and "calibrate" commands, since sets and mrbayes blocks
     * are routed here by parseFile:
     * <ul>
     * <li>charset: adds a FilteredAlignment on {@link #m_alignment} to {@link #filteredAlignments}</li>
     * <li>wtset: sets the site weights of {@link #m_alignment} (ignored with a warning if there is no alignment yet)</li>
     * <li>taxset: adds a TaxonSet to {@link #taxonsets}</li>
     * <li>calibrate: adds a monophyletic MRCAPrior with the given distribution to {@link #calibrations}</li>
     * </ul>
     * Single and double quotes around taxon and taxon set names are removed.
     *
     * @param fin reader positioned just after the "begin ...;" line
     * @throws IOException      if the file ends before "end;" is found
     * @throws RuntimeException if a wtset, taxset or calibrate command is malformed
     */
    void parseAssumptionsBlock(final BufferedReader fin) throws IOException {
        String str;
        do {
            str = nextLine(fin);
            if (str == null) {
                throw new IOException("Unexpected end of file: block is not terminated by 'end;'");
            }
            if (str.toLowerCase().matches("\\s*charset\\s.*")) {
                // remove text in brackets (as TreeBase files are wont to contain)
                str = str.replaceAll("\\(.*\\)", "");
                // clean up spaces
                str = str.replaceAll("^\\s+", "");
                str = str.replaceAll("\\s*-\\s*", "-");
                str = str.replaceAll("\\s*\\\\\\s*", "\\\\");
                str = str.replaceAll("\\s*;", "");
                // expected shape now: charset <id> = <range> <range> ...
                final String[] strs = str.trim().split("\\s+");
                final String id = strs[1];
                String rangeString = "";
                for (int i = 3; i < strs.length; i++) {
                    rangeString += strs[i] + " ";
                }
                rangeString = rangeString.trim().replace(' ', ',');
                final FilteredAlignment alignment = new FilteredAlignment();
                alignment.setID(id);
                alignment.alignmentInput.setValue(m_alignment, alignment);
                alignment.filterInput.setValue(rangeString, alignment);
                alignment.initAndValidate();
                filteredAlignments.add(alignment);
            } else if (str.toLowerCase().matches("\\s*wtset\\s.*")) {
                String [] strs = str.split("=");
                if (strs.length > 1) {
                    str = strs[strs.length - 1].trim();
                    strs = str.split("\\s+");
                    // parsing validates that every weight is an integer
                    int [] weights = new int[strs.length];
                    for (int i = 0; i< strs.length; i++) {
                        weights[i] = Integer.parseInt(strs[i]);
                    }
                    if (m_alignment != null) {
                        if (weights.length != m_alignment.getSiteCount()) {
                            throw new RuntimeException("Number of weights (" + weights.length+ ") " +
                                    "does not match number of sites in alignment(" + m_alignment.getSiteCount()+ ")");
                        }
                        StringBuilder weightStr = new StringBuilder();
                        for (String str2 : strs) {
                            weightStr.append(str2);
                            weightStr.append(',');
                        }
                        weightStr.delete(weightStr.length() - 1, weightStr.length());
                        m_alignment.siteWeightsInput.setValue(weightStr.toString(), m_alignment);
                        m_alignment.initAndValidate();
                    } else {
                        Log.warning.println("WTSET was specified before alignment. WTSET is ignored.");
                    }
                }
            } else if (str.toLowerCase().matches("\\s*taxset\\s.*")) {
                String [] strs = str.split("=");
                if (strs.length > 1) {
                    String str0 = strs[0].trim();
                    String [] strs2 = str0.split("\\s+");
                    if (strs2.length != 2) {
                        throw new RuntimeException("expected 'taxset <name> = ...;' but did not get two words before the = sign: " + str);
                    }
                    String taxonSetName = strs2[1];
                    str0 = strs[strs.length - 1].trim();
                    if (!str0.endsWith(";")) {
                        Log.warning.println("expected 'taxset <name> = ...;' semi-colin is missing: " + str + "\n"
                                + "Taxa from following lines may be missing.");
                    }
                    str0 = str0.replaceAll(";", "");
                    String [] taxonNames = str0.split("\\s+");
                    TaxonSet taxonset = new TaxonSet();
                    for (String taxon : taxonNames) {
                        // character class: strip either kind of quote. The former pattern
                        // '" only matched a single quote directly followed by a double quote.
                        taxonset.taxonsetInput.get().add(new Taxon(taxon.replaceAll("['\"]", "")));
                    }
                    taxonset.setID(taxonSetName.replaceAll("['\"]", ""));
                    taxonsets.add(taxonset);
                }
            } else if (str.toLowerCase().matches("^\\s*calibrate\\s.*")) {
                // define calibration represented by an MRCAPRior,
                // taxon sets need to be specified earlier, but can also be a single taxon
                // e.g.
                // begin mrbayes;
                // calibrate germanic = normal(1000,50)
                // calibrate hittite = normal(3450,100)
                // calibrate english = fixed(0)
                // end;
                String [] strs = str.split("=");
                if (strs.length > 1) {
                    String str0 = strs[0].trim();
                    String [] strs2 = str0.split("\\s+");
                    if (strs2.length != 2) {
                        throw new RuntimeException("expected 'calibrate <name> = ...' but did not get two words before the = sign: " + str);
                    }
                    // first, get the taxon
                    String taxonSetName = strs2[1].replaceAll("['\"]", "");
                    TaxonSet taxonset = null;
                    for (TaxonSet t : taxonsets) {
                        if (t.getID().equals(taxonSetName)) {
                            taxonset = t;
                        }
                    }
                    if (taxonset == null) {
                        // perhaps it is a singleton
                        for (Taxon t : taxonList) {
                            if (t.getID().equals(taxonSetName)) {
                                taxonset = new TaxonSet();
                                taxonset.setID(t.getID() + ".leaf");
                                taxonset.taxonsetInput.setValue(t, taxonset);
                            }
                        }
                    }
                    if (taxonset == null) {
                        throw new RuntimeException("Could not find taxon/taxonset " + taxonSetName + " in calibration: " + str);
                    }

                    // next get the calibration
                    // "normal(1000,50)" is split into { "normal", "1000", "50" }
                    str0 = strs[strs.length - 1].trim();
                    String [] strs3 = str0.split("[\\(,\\)]");
                    RealParameter [] param = new RealParameter[strs3.length];
                    for (int i = 1; i < strs3.length; i++) {
                        try {
                            param[i] = new RealParameter(strs3[i]);
                            param[i].setID("param." + i);
                        } catch (Exception e) {
                            // ignore parsing errors
                        }
                    }
                    ParametricDistribution distr = null;
                    switch (strs3[0]) {
                    case "normal":
                        distr = new Normal();
                        distr.initByName("mean", param[1], "sigma", param[2]);
                        distr.setID("Normal.0");
                        break;
                    case "uniform":
                        distr = new Uniform();
                        distr.initByName("lower", strs3[1], "upper", strs3[2]);
                        distr.setID("Uniform.0");
                        break;
                    case "fixed":
                        // uniform with lower == upper
                        // NOTE(review): the code builds a Normal with sigma "+Infinity",
                        // not a Uniform as the line above says - confirm the intent
                        distr = new Normal();
                        distr.initByName("mean", param[1], "sigma", "+Infinity");
                        distr.setID("Normal.0");
                        break;
                    case "offsetlognormal":
                        distr = new LogNormalDistributionModel();
                        distr.initByName("offset", strs3[1], "M", param[2], "S", param[3], "meanInRealSpace", true);
                        distr.setID("LogNormalDistributionModel.0");
                        break;
                    case "lognormal":
                        distr = new LogNormalDistributionModel();
                        distr.initByName("M", param[1], "S", param[2], "meanInRealSpace", true);
                        distr.setID("LogNormalDistributionModel.0");
                        break;
                    case "offsetexponential":
                        distr = new Exponential();
                        distr.initByName("offset", strs3[1], "mean", param[2]);
                        distr.setID("Exponential.0");
                        break;
                    case "gamma":
                        distr = new Gamma();
                        distr.initByName("alpha", param[1], "beta", param[2]);
                        distr.setID("Gamma.0");
                        break;
                    case "offsetgamma":
                        distr = new Gamma();
                        distr.initByName("offset", strs3[1], "alpha", param[2], "beta", param[3]);
                        distr.setID("Gamma.0");
                        break;
                    default:
                        throw new RuntimeException("Unknwon distribution "+ strs3[0] +"in calibration: " + str);
                    }
                    MRCAPrior prior = new MRCAPrior();
                    prior.isMonophyleticInput.setValue(true, prior);
                    prior.distInput.setValue(distr, prior);
                    prior.taxonsetInput.setValue(taxonset, prior);
                    prior.setID(taxonset.getID() + ".prior");
                    // should set Tree before initialising, but we do not know the tree yet...
                    if (calibrations == null) {
                        calibrations = new ArrayList<>();
                    }
                    calibrations.add(prior);
                }
            }
        } while (!str.toLowerCase().contains("end;"));
    }

    private void processSets() {
        // create monophyletic MRCAPrior for each taxon set that
        // does not already have a calibration associated with it
        for (TaxonSet taxonset : taxonsets) {
            boolean found = false;
            for (BEASTInterface o : taxonset.getOutputs()) {
                if (o instanceof MRCAPrior) {
                    found = true;
                    break;
                }
            }
            if (!found) {
                MRCAPrior prior = new MRCAPrior();
                prior.isMonophyleticInput.setValue(true, prior);
                prior.taxonsetInput.setValue(taxonset, prior);
                prior.setID(taxonset.getID() + ".prior");
                // should set Tree before initialising, but we do not know the tree yet...
if (calibrations == null) { calibrations = new ArrayList<>(); } calibrations.add(prior); } } } /** * parse sets block * BEGIN Sets; * TAXSET 'con' = 'con_SL_Gert2' 'con_SL_Tran6' 'con_SL_Tran7' 'con_SL_Gert6'; * TAXSET 'spa' = 'spa_138a_Cerb' 'spa_JB_Eyre1' 'spa_JB_Eyre2'; * END; [Sets] */ void parseSetsBlock(final BufferedReader fin) throws IOException { String str; do { str = nextLine(fin); if (str.toLowerCase().matches("\\s*taxset\\s.*")) { String [] strs = str.split("="); if (strs.length > 1) { String str0 = strs[0].trim(); String [] strs2 = str0.split("\\s+"); if (strs2.length != 2) { throw new RuntimeException("expected 'taxset <name> = ...;' but did not get two words before the = sign: " + str); } String taxonSetName = strs2[1]; str0 = strs[strs.length - 1].trim(); if (!str0.endsWith(";")) { Log.warning.println("expected 'taxset <name> = ...;' semi-colin is missing: " + str + "\n" + "Taxa from following lines may be missing."); } str0 = str0.replaceAll(";", ""); String [] taxonNames = str0.split("\\s+"); TaxonSet taxonset = new TaxonSet(); for (String taxon : taxonNames) { taxonset.taxonsetInput.get().add(new Taxon(taxon.replaceAll("'\"", ""))); } taxonset.setID(taxonSetName.replaceAll("'\"", "")); taxonsets.add(taxonset); } } } while (!str.toLowerCase().contains("end;")); } public static String generateSequenceID(final String taxon) { String id = "seq_" + taxon; int i = 0; while (g_sequenceIDs.contains(id + (i > 0 ? i : ""))) { i++; } id = id + (i > 0 ? 
i : ""); g_sequenceIDs.add(id); return id; } /** * read line from nexus file * */ String readLine(final BufferedReader fin) throws IOException { if (!fin.ready()) { return null; } lineNr++; return fin.readLine(); } /** * read next line from nexus file that is not a comment and not empty * */ String nextLine(final BufferedReader fin) throws IOException { String str = readLine(fin); if (str == null) { return null; } if (str.contains("[")) { final int start = str.indexOf('['); int end = str.indexOf(']', start); while (end < 0) { str += readLine(fin); end = str.indexOf(']', start); } str = str.substring(0, start) + str.substring(end + 1); if (str.matches("^\\s*$")) { return nextLine(fin); } } if (str.matches("^\\s*$")) { return nextLine(fin); } return str; } /** * return attribute value as a string * */ String getAttValue(final String attribute, final String str) { final Pattern pattern = Pattern.compile(".*" + attribute + "\\s*=\\s*([^\\s;]+).*"); final Matcher matcher = pattern.matcher(str.toLowerCase()); if (!matcher.find()) { return null; } String att = matcher.group(1); if (att.startsWith("\"") && att.endsWith("\"")) { final int start = matcher.start(1); att = str.substring(start + 1, str.indexOf('"', start + 1)); } return att; } private ArrayList<String> readInCharstatelablesTokens(final BufferedReader fin) throws IOException { ArrayList<String> tokens = new ArrayList<>(); String token=""; final int READING=0, OPENQUOTE=1, WAITING=2; int mode = WAITING; int numberOfQuotes=0; boolean endOfBlock=false; String str; while (!endOfBlock) { str = nextLine(fin); Character nextChar; for (int i=0; i< str.length(); i++) { nextChar=str.charAt(i); switch (mode) { case WAITING: if (!Character.isWhitespace(nextChar)) { if (nextChar == '\'') { mode=OPENQUOTE; } else if (nextChar == '/' || nextChar == ',') { tokens.add(nextChar.toString()); token=""; } else if (nextChar == ';') { endOfBlock = true; } else { token=token+nextChar; mode=READING; } } break; case READING: if (nextChar 
== '\'') { tokens.add(token); token=""; mode=OPENQUOTE; } else if (nextChar == '/' || nextChar == ',') { tokens.add(token); tokens.add(nextChar.toString()); token=""; mode=WAITING; } else if (nextChar == ';') { tokens.add(token); endOfBlock = true; } else if (Character.isWhitespace(nextChar)) { tokens.add(token); token=""; mode=WAITING; } else { token=token+nextChar; } break; case OPENQUOTE: if (nextChar == '\'') { numberOfQuotes++; } else { if (numberOfQuotes % 2 == 0) { for (int ind=0; ind< numberOfQuotes/2; ind++) { token=token+"'"; } token=token+nextChar; } else { for (int ind=0; ind< numberOfQuotes/2; ind++) { token=token+"'"; } tokens.add(token); token=""; if (nextChar == '/' || nextChar == ',') { tokens.add(nextChar.toString()); mode=WAITING; } else if (nextChar == ';') { endOfBlock = true; } else if (Character.isWhitespace(nextChar)) { mode=WAITING; } else { token=token+nextChar; mode=READING; } } numberOfQuotes=0; } break; default: break; } } } if (!tokens.get(tokens.size()-1).equals(",")) { tokens.add(","); } return tokens; } private ArrayList<UserDataType> processCharstatelabelsTokens(ArrayList<String> tokens, int[] maxNumberOfStates) throws IOException { ArrayList<UserDataType> charDescriptions = new ArrayList<>(); final int CHAR_NR=0, CHAR_NAME=1, STATES=2; int mode = CHAR_NR; int charNumber = -1; String charName = ""; ArrayList<String> states = new ArrayList<>(); for (String token:tokens) { switch (mode) { case CHAR_NR: charNumber = Integer.parseInt(token); mode = CHAR_NAME; break; case CHAR_NAME: if (token.equals("/")) { mode = STATES; } else if (token.equals(",")) { if (charNumber > charDescriptions.size()+1) { throw new IOException("Character descriptions should go in the ascending order and there " + "should not be any description missing."); } charDescriptions.add(new UserDataType(charName, states)); maxNumberOfStates[0] = Math.max(maxNumberOfStates[0], states.size()); charNumber = -1; charName = ""; states = new ArrayList<>(); mode = CHAR_NR; } 
else { charName = token; } break; case STATES: if (token.equals(",")) { if (charNumber > charDescriptions.size()+1) { throw new IOException("Character descriptions should go in the ascending order and there " + "should not be any description missing."); } charDescriptions.add(new UserDataType(charName, states)); maxNumberOfStates[0] = Math.max(maxNumberOfStates[0], states.size()); charNumber = -1; charName = ""; states = new ArrayList<>(); mode = CHAR_NR; } else { states.add(token); } default: break; } } return charDescriptions; } public static void main(final String[] args) { try { final NexusParser parser = new NexusParser(); parser.parseFile(new File(args[0])); if (parser.taxa != null) { System.out.println(parser.taxa.size() + " taxa"); System.out.println(Arrays.toString(parser.taxa.toArray(new String[parser.taxa.size()]))); } if (parser.trees != null) { System.out.println(parser.trees.size() + " trees"); } if (parser.m_alignment != null) { final String xml = new XMLProducer().toXML(parser.m_alignment); System.out.println(xml); } if (parser.traitSet != null) { final String xml = new XMLProducer().toXML(parser.traitSet); System.out.println(xml); } } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } } // main } // class NexusParser