/* * FrequencyModelParser.java * * Copyright (c) 2002-2016 Alexei Drummond, Andrew Rambaut and Marc Suchard * * This file is part of BEAST. * See the NOTICE file distributed with this work for additional * information regarding copyright ownership and licensing. * * BEAST is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * BEAST is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with BEAST; if not, write to the * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, * Boston, MA 02110-1301 USA */ package dr.evomodelxml.substmodel; import dr.evomodel.substmodel.FrequencyModel; import dr.app.bss.Utils; import dr.evolution.alignment.PatternList; import dr.evolution.datatype.Codons; import dr.evolution.datatype.DataType; import dr.evolution.datatype.HiddenDataType; import dr.evolution.datatype.Nucleotides; import dr.evoxml.util.DataTypeUtils; import dr.inference.model.Parameter; import dr.xml.*; import java.text.NumberFormat; import java.util.Arrays; import java.util.List; import java.util.logging.Logger; /** * @author Alexei Drummond * @author Andrew Rambaut */ public class FrequencyModelParser extends AbstractXMLObjectParser { public static final String FREQUENCIES = "frequencies"; public static final String FREQUENCY_MODEL = "frequencyModel"; public static final String NORMALIZE = "normalize"; public static final String COMPRESS = "compress"; public static final String COMPOSITION = "composition"; public static final String FREQ_3x4 = "3x4"; public static final String[] COMPOSITION_TYPES = new String[] { FREQ_3x4 }; public String getParserName() { return FREQUENCY_MODEL; } public Object parseXMLObject(XMLObject xo) throws XMLParseException { DataType dataType = DataTypeUtils.getDataType(xo); Parameter freqsParam = (Parameter) xo.getElementFirstChild(FREQUENCIES); double[] frequencies = null; for (int i = 0; i < xo.getChildCount(); i++) { Object obj = xo.getChild(i); if (obj instanceof PatternList) { PatternList patternList = (PatternList) obj; if (xo.getAttribute(COMPRESS, false) && (patternList.getDataType() instanceof HiddenDataType)) { double[] hiddenFrequencies = patternList .getStateFrequencies(); int hiddenCount = ((HiddenDataType) patternList .getDataType()).getHiddenClassCount(); int baseStateCount = hiddenFrequencies.length / hiddenCount; frequencies = new double[baseStateCount]; for (int j = 0; j < baseStateCount; ++j) { for (int k = 0; k < hiddenCount; ++k) { frequencies[j] += hiddenFrequencies[j + k * baseStateCount]; } } } else { // TODO if (xo.hasAttribute(COMPOSITION)) { String type = xo.getStringAttribute(COMPOSITION); if (type.equalsIgnoreCase(FREQ_3x4)) { frequencies = getEmpirical3x4Freqs(patternList); } } else { frequencies = patternList.getStateFrequencies(); } // END: composition check } break; }// END: patternList check } StringBuilder sb = new StringBuilder( "\nCreating state frequencies model '" + freqsParam.getParameterName() + "': "); if (frequencies != null) { if (freqsParam.getDimension() != frequencies.length) { throw new XMLParseException( "dimension of frequency parameter and number of sequence states don't match."); } for (int j = 0; j < frequencies.length; j++) { freqsParam.setParameterValue(j, frequencies[j]); } sb.append("Using empirical frequencies from data "); } else { sb.append("Initial frequencies "); } sb.append("= {"); if (xo.getAttribute(NORMALIZE, false)) { double sum = 0; for (int j = 0; j < freqsParam.getDimension(); j++) sum += freqsParam.getParameterValue(j); for (int j = 0; j < freqsParam.getDimension(); j++) { if (sum != 0) freqsParam.setParameterValue(j, freqsParam.getParameterValue(j) / sum); else freqsParam.setParameterValue(j, 1.0 / freqsParam.getDimension()); } } NumberFormat format = NumberFormat.getNumberInstance(); format.setMaximumFractionDigits(5); sb.append(format.format(freqsParam.getParameterValue(0))); for (int j = 1; j < freqsParam.getDimension(); j++) { sb.append(", "); sb.append(format.format(freqsParam.getParameterValue(j))); } sb.append("}"); Logger.getLogger("dr.evomodel").info(sb.toString()); return new FrequencyModel(dataType, freqsParam); }// END: parseXMLObject private double[] getEmpirical3x4Freqs(PatternList patternList) { DataType nucleotideDataType = Nucleotides.INSTANCE; Codons codonDataType = Codons.UNIVERSAL; List<String> stopCodonsList = Arrays.asList(Utils.STOP_CODONS); int cStateCount = codonDataType.getStateCount(); int nStateCount = nucleotideDataType.getStateCount(); int npos = 3; double[] stopCodonFreqs = new double[Utils.STOP_CODONS.length]; double counts[][] = new double[nStateCount][npos]; int countsPos[] = new int[npos]; int patternCount = patternList.getPatternCount(); for (int i = 0; i < patternCount; i++) { int[] sitePatterns = patternList.getPattern(i); for (int k = 0; k < sitePatterns.length; k++) { int codonState = sitePatterns[k]; int[] nucleotideStates = codonDataType.getTripletStates(codonState); String triplet = codonDataType.getTriplet(codonState); // if(triplet.equals("TAA")) { // System.err.println(triplet); // } if(stopCodonsList.contains(triplet)) { int stopCodonIndex = stopCodonsList.indexOf(triplet); stopCodonFreqs[stopCodonIndex]++; }//END: stopCodon check for(int pos = 0; pos < npos; pos++) { int nucleotideState = nucleotideStates[pos]; counts[nucleotideState][pos]++; countsPos[pos]++; }//END: nucleotide positions loop }//END: sitePatterns loop }// sites loop int total = 0; for (int pos = 0; pos < npos; pos++) { int totalPos = countsPos[pos]; for (int s = 0; s < nStateCount; s++) { counts[s][pos] = counts[s][pos] / totalPos; }//END: nucleotide states loop total += totalPos; }//END: nucleotide positions loop // Utils.printArray(stopCodonFreqs); // System.out.println(stopCodonFreqs.length); // add stop codon frequencies double pi = 0.0; for(int i=0;i<stopCodonFreqs.length;i++) { double freq = stopCodonFreqs[i] / total; pi += freq; } // System.out.println(pi); double[] freqs = new double[cStateCount]; Arrays.fill(freqs, 1.0); for (int codonState = 0; codonState < cStateCount; codonState++) { int[] nucleotideStates = codonDataType.getTripletStates(codonState); for (int pos = 0; pos < npos; pos++) { int nucleotide = nucleotideStates[pos]; freqs[codonState] *= counts[nucleotide][pos]; }// END: nucleotide positions loop // TODO: stop codons freqs freqs[codonState] = freqs[codonState] / (1-pi); }//END: codon states loop return freqs; }// END: getEmpirical3x4Freqs public String getParserDescription() { return "A model of equilibrium base frequencies."; } public Class getReturnType() { return FrequencyModel.class; } public XMLSyntaxRule[] getSyntaxRules() { return rules; } private final XMLSyntaxRule[] rules = { new StringAttributeRule(COMPOSITION, "Composition type", COMPOSITION_TYPES, true), AttributeRule.newBooleanRule(NORMALIZE, true), AttributeRule.newBooleanRule(COMPRESS, true), new ElementRule(PatternList.class, "Initial value", 0, 1), new XORRule(new StringAttributeRule(DataType.DATA_TYPE, "The type of sequence data", DataType.getRegisteredDataTypeNames(), false), new ElementRule(DataType.class)), new ElementRule(FREQUENCIES, new XMLSyntaxRule[] { new ElementRule( Parameter.class) }), }; }