/* * File Alignment.java * * Copyright (C) 2010 Remco Bouckaert remco@cs.auckland.ac.nz * * This file is part of BEAST2. * See the NOTICE file distributed with this work for additional * information regarding copyright ownership and licensing. * * BEAST is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * BEAST is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with BEAST; if not, write to the * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, * Boston, MA 02110-1301 USA */ package beast.evolution.alignment; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; import java.util.HashSet; import java.util.List; import java.util.Set; import beast.core.Description; import beast.core.Input; import beast.core.Input.Validate; import beast.core.parameter.Map; import beast.core.util.Log; import beast.evolution.datatype.DataType; import beast.evolution.datatype.StandardData; import beast.util.AddOnManager; @Description("Class representing alignment data") public class Alignment extends Map<String> { @Override protected Class<?> mapType() { return String.class; } /** * default data type * */ protected final static String NUCLEOTIDE = "nucleotide"; /** * directory to pick up data types from * */ final static String[] IMPLEMENTATION_DIR = {"beast.evolution.datatype"}; /** * list of data type descriptions, obtained from DataType classes * */ static List<String> types = new ArrayList<>(); static { findDataTypes(); } static public void findDataTypes() { // build up list of data types List<String> m_sDataTypes = AddOnManager.find(beast.evolution.datatype.DataType.class, IMPLEMENTATION_DIR); for (String dataTypeName : m_sDataTypes) { try { DataType dataType = (DataType) Class.forName(dataTypeName).newInstance(); if (dataType.isStandard()) { String description = dataType.getTypeDescription(); if (!types.contains(description)) { types.add(description); } } } catch (Exception e) { // TODO: handle exception } } } final public Input<List<Sequence>> sequenceInput = new Input<>("sequence", "sequence and meta data for particular taxon", new ArrayList<>(), Validate.OPTIONAL); final public Input<TaxonSet> taxonSetInput = new Input<>("taxa", "An optional taxon-set used only to sort the sequences into the same order as they appear in the taxon-set.", new TaxonSet(), Validate.OPTIONAL); final public Input<Integer> stateCountInput = new Input<>("statecount", "maximum number of states in all sequences"); final public Input<String> dataTypeInput = new Input<>("dataType", "data type, one of " + types, NUCLEOTIDE, types.toArray(new String[0])); final public Input<DataType.Base> userDataTypeInput = new Input<>("userDataType", "non-standard, user specified data type, if specified 'dataType' is ignored"); final public Input<Boolean> stripInvariantSitesInput = new Input<>("strip", "sets weight to zero for sites that are invariant (e.g. all 1, all A or all unkown)", false); final public Input<String> siteWeightsInput = new Input<>("weights", "comma separated list of weights, one for each site in the sequences. If not specified, each site has weight 1"); final public Input<Boolean> isAscertainedInput = new Input<>("ascertained", "is true if the alignment allows ascertainment correction, i.e., conditioning the " + "Felsenstein likelihood on excluding constant sites from the alignment", false); /** * Inputs from AscertainedAlignment */ final public Input<Integer> excludefromInput = new Input<>("excludefrom", "first site to condition on, default 0", 0); final public Input<Integer> excludetoInput = new Input<>("excludeto", "last site to condition on (but excluding this site), default 0", 0); final public Input<Integer> excludeeveryInput = new Input<>("excludeevery", "interval between sites to condition on (default 1)", 1); public Input<Integer> m_includefrom = new Input<>("includefrom","first site to condition on, default 0", 0); public Input<Integer> m_includeto = new Input<>("includeto","last site to condition on, default 0", 0); public Input<Integer> m_includeevery = new Input<>("includeevery","interval between sites to condition on (default 1)", 1); /** * list of sequences in the alignment * */ protected List<Sequence> sequences = new ArrayList<>(); /** * list of taxa names defined through the sequences in the alignment * */ protected List<String> taxaNames = new ArrayList<>(); /** * list of state counts for each of the sequences, typically these are * constant throughout the whole alignment. */ protected List<Integer> stateCounts = new ArrayList<>(); /** * maximum of m_nStateCounts * */ protected int maxStateCount; /** * state codes for the sequences * */ protected List<List<Integer>> counts = new ArrayList<>(); /** * data type, useful for converting String sequence to Code sequence, and back * */ protected DataType m_dataType; /** * weight over the columns of a matrix * */ protected int[] patternWeight; /** * weights of sites -- assumed 1 for each site if not specified */ protected int[] siteWeights = null; /** * Probabilities associated with each tip of the tree, for use when the * characters are uncertain. */ public List<double[][]> tipLikelihoods = new ArrayList<>(); // #taxa x #sites x #states private boolean usingTipLikelihoods = false; /** * pattern state encodings * */ protected int [][] sitePatterns; // #patterns x #taxa /** * maps site nr to pattern nr * */ protected int[] patternIndex; /** * From AscertainedAlignment */ Set<Integer> excludedPatterns; List<Integer> m_nIncluded; /** * A flag to indicate if the alignment is ascertained */ public boolean isAscertained; public Alignment() { } /** * Constructor for testing purposes. * * @param sequences * @param stateCount * @param dataType * @deprecated This is the deprecated legacy form and will be removed * at some point. Use {@link #Alignment(List, String)} instead. */ @Deprecated public Alignment(List<Sequence> sequences, Integer stateCount, String dataType) { this(sequences, dataType); } /** * Constructor for testing purposes. * * @param sequences * @param dataType */ public Alignment(List<Sequence> sequences, String dataType) { for (Sequence sequence : sequences) { sequenceInput.setValue(sequence, this); } dataTypeInput.setValue(dataType, this); initAndValidate(); } @Override public void initAndValidate() { if (sequenceInput.get().size() == 0 && defaultInput.get().size() == 0) { throw new IllegalArgumentException("Either a sequence input must be specified, or a map of strings must be specified"); } if (siteWeightsInput.get() != null) { String str = siteWeightsInput.get().trim(); String[] strs = str.split(","); siteWeights = new int[strs.length]; for (int i = 0; i < strs.length; i++) { siteWeights[i] = Integer.parseInt(strs[i].trim()); } } // determine data type, either user defined or one of the standard ones if (userDataTypeInput.get() != null) { m_dataType = userDataTypeInput.get(); } else { if (types.indexOf(dataTypeInput.get()) < 0) { throw new IllegalArgumentException("data type + '" + dataTypeInput.get() + "' cannot be found. " + "Choose one of " + Arrays.toString(types.toArray(new String[0]))); } // seems to spend forever in there?? List<String> dataTypes = AddOnManager.find(beast.evolution.datatype.DataType.class, IMPLEMENTATION_DIR); for (String dataTypeName : dataTypes) { DataType dataType; try { dataType = (DataType) Class.forName(dataTypeName).newInstance(); if (dataTypeInput.get().equals(dataType.getTypeDescription())) { m_dataType = dataType; break; } } catch (InstantiationException | IllegalAccessException | ClassNotFoundException e) { throw new IllegalArgumentException(e.getMessage()); } } } // initialize the sequence list if (sequenceInput.get().size() > 0) { sequences = sequenceInput.get(); } else { // alignment defined by a map of id -> sequence List<String> taxa = new ArrayList<>(); taxa.addAll(map.keySet()); sequences.clear(); for (String key : taxa) { String sequence = map.get(key); sequences.add(new Sequence(key, sequence)); } } // initialize the alignment from the given list of sequences initializeWithSequenceList(sequences, true); if (taxonSetInput.get() != null && taxonSetInput.get().getTaxonCount() > 0) { sortByTaxonSet(taxonSetInput.get()); } Log.info.println(toString(false)); } /** * Initializes the alignment given the provided list of sequences and no other information. * It site weights and/or data type have been previously set up with initAndValidate then they * remain in place. This method is used mainly to re-order the sequences to a new taxon order * when an analysis of multiple alignments on the same taxa are undertaken. * * @param sequences */ private void initializeWithSequenceList(List<Sequence> sequences, boolean log) { this.sequences = sequences; taxaNames.clear(); stateCounts.clear(); counts.clear(); try { for (Sequence seq : sequences) { counts.add(seq.getSequence(m_dataType)); if (taxaNames.contains(seq.getTaxon())) { throw new RuntimeException("Duplicate taxon found in alignment: " + seq.getTaxon()); } taxaNames.add(seq.getTaxon()); tipLikelihoods.add(seq.getLikelihoods()); // if seq.isUncertain() == false then the above line adds 'null' // to the list, indicating that this particular sequence has no tip likelihood information usingTipLikelihoods |= (seq.getLikelihoods() != null); if (seq.totalCountInput.get() != null) { stateCounts.add(seq.totalCountInput.get()); } else { stateCounts.add(m_dataType.getStateCount()); } } if (counts.size() == 0) { // no sequence data throw new RuntimeException("Sequence data expected, but none found"); } } catch (Exception e) { e.printStackTrace(); throw new RuntimeException(e); } sanityCheckCalcPatternsSetUpAscertainment(log); } /** * Checks that sequences are all the same length, calculates patterns and sets up ascertainment. */ private void sanityCheckCalcPatternsSetUpAscertainment(boolean log) { // Sanity check: make sure sequences are of same length int length = counts.get(0).size(); if (!(m_dataType instanceof StandardData)) { for (List<Integer> seq : counts) { if (seq.size() != length) { throw new RuntimeException("Two sequences with different length found: " + length + " != " + seq.size()); } } } if (siteWeights != null && siteWeights.length != length) { throw new RuntimeException("Number of weights (" + siteWeights.length + ") does not match sequence length (" + length + ")"); } calcPatterns(log); setupAscertainment(); } /** * Sorts an alignment by a provided TaxonSet, so that the sequence/taxon pairs in the alignment match the order * that the taxa appear in the TaxonSet (i.e. not necessarily alphabetically). * * @param toSortBy the taxon set that species the order on the taxa. */ public void sortByTaxonSet(TaxonSet toSortBy) { List<Sequence> sortedSeqs = new ArrayList<>(); sortedSeqs.addAll(sequences); Collections.sort(sortedSeqs, (Sequence o1, Sequence o2) -> { return Integer.compare(toSortBy.getTaxonIndex(o1.getTaxon()), toSortBy.getTaxonIndex(o2.getTaxon())); } ); initializeWithSequenceList(sortedSeqs, false); } void setupAscertainment() { isAscertained = isAscertainedInput.get(); if (isAscertained) { //From AscertainedAlignment int from = excludefromInput.get(); int to = excludetoInput.get(); int every = excludeeveryInput.get(); excludedPatterns = new HashSet<>(); for (int i = from; i < to; i += every) { int patternIndex_ = patternIndex[i]; // reduce weight, so it does not confuse the tree likelihood patternWeight[patternIndex_] = 0; excludedPatterns.add(patternIndex_); } from = m_includefrom.get(); to = m_includeto.get(); every = m_includeevery.get(); m_nIncluded = new ArrayList<>(); for (int i = from; i < to; i += every) { int patternIndex_ = patternIndex[i]; // reduce weight, so it does not confuse the tree likelihood patternWeight[patternIndex_] = 0; m_nIncluded.add(patternIndex_); } } else { // sanity check int from = excludefromInput.get(); int to = excludetoInput.get(); if (from != excludefromInput.defaultValue || to != excludetoInput.defaultValue) { Log.warning.println("WARNING: excludefrom or excludeto is specified, but 'ascertained' flag is not set to true"); Log.warning.println("WARNING: to suppress this warning, remove the excludefrom or excludeto attributes (if no astertainment correction is required)"); Log.warning.println("WARNING: or set the 'ascertained' flag to true on element with id=" + getID()); } } } // initAndValidate static String getSequence(Alignment data, int taxonIndex) { int[] states = new int[data.getPatternCount()]; for (int i = 0; i < data.getPatternCount(); i++) { int[] sitePattern = data.getPattern(i); states[i] = sitePattern[taxonIndex]; } try { return data.getDataType().state2string(states); } catch (Exception e) { e.printStackTrace(); System.exit(1); } return null; } /* * assorted getters and setters * */ public List<String> getTaxaNames() { if (taxaNames.size() == 0) { try { initAndValidate(); } catch (Exception e) { e.printStackTrace(); throw new RuntimeException(e); } } return taxaNames; } public List<Integer> getStateCounts() { return stateCounts; } /** * Returns a List of Integer Lists where each Integer List represents * the sequence corresponding to a taxon. The taxon is identified by * the position of the Integer List in the outer List, which corresponds * to the nodeNr of the corresponding leaf node and the position of the * taxon name in the taxaNames list. * * @return integer representation of sequence alignment */ public List<List<Integer>> getCounts() { return counts; } public DataType getDataType() { return m_dataType; } /** * @return number of taxa in Alignment. */ public int getTaxonCount() { //if (taxonsetInput.get() != null) { // return taxonsetInput.get().getTaxonCount(); //} return taxaNames.size(); } /** * @return number of taxa in Alignment. * @deprecated Use getTaxonCount() instead. */ @Deprecated public int getNrTaxa() { return getTaxonCount(); } public int getTaxonIndex(String id) { return taxaNames.indexOf(id); } /** * @return Number of unique character patterns in alignment. */ public int getPatternCount() { return sitePatterns.length; } public int[] getPattern(int patternIndex_) { return sitePatterns[patternIndex_]; } public int getPattern(int taxonIndex, int patternIndex_) { return sitePatterns[patternIndex_][taxonIndex]; } /** * Retrieve the "weight" of a particular pattern: the number of sites * having that pattern. * * @param patternIndex_ Index into pattern array. * @return pattern weight */ public int getPatternWeight(int patternIndex_) { return patternWeight[patternIndex_]; } public int getMaxStateCount() { return maxStateCount; } /** * Retrieve index of pattern corresponding to a particular site. * * @param site Index of site. * @return Index of pattern. */ public int getPatternIndex(int site) { return patternIndex[site]; } /** * @return Total number of sites in alignment. */ public int getSiteCount() { return patternIndex.length; } /** * Retrieve an array containing the number of times each character pattern * occurs in the alignment. * * @return Pattern weight array. */ public int[] getWeights() { return patternWeight; } /** * SiteComparator is used for ordering the sites, * which makes it easy to identify patterns. */ class SiteComparator implements Comparator<int[]> { @Override public int compare(int[] o1, int[] o2) { for (int i = 0; i < o1.length; i++) { if (o1[i] > o2[i]) { return 1; } if (o1[i] < o2[i]) { return -1; } } return 0; } } // class SiteComparator protected void calcPatterns() { calcPatterns(true); } /** * calculate patterns from sequence data * * */ private void calcPatterns(boolean log) { int taxonCount = counts.size(); int siteCount = counts.get(0).size(); // convert data to transposed int array int[][] data = new int[siteCount][taxonCount]; for (int i = 0; i < taxonCount; i++) { List<Integer> sites = counts.get(i); for (int j = 0; j < siteCount; j++) { data[j][i] = sites.get(j); } } // sort data SiteComparator comparator = new SiteComparator(); Arrays.sort(data, comparator); // count patterns in sorted data // if (siteWeights != null) the weights are recalculated below int patterns = 1; int[] weights = new int[siteCount]; weights[0] = 1; for (int i = 1; i < siteCount; i++) { if (usingTipLikelihoods || comparator.compare(data[i - 1], data[i]) != 0) { // In the case where we're using tip probabilities, we need to treat each // site as a unique pattern, because it could have a unique probability vector. patterns++; data[patterns - 1] = data[i]; } weights[patterns - 1]++; } // reserve memory for patterns patternWeight = new int[patterns]; sitePatterns = new int[patterns][taxonCount]; for (int i = 0; i < patterns; i++) { patternWeight[i] = weights[i]; sitePatterns[i] = data[i]; } // find patterns for the sites patternIndex = new int[siteCount]; for (int i = 0; i < siteCount; i++) { int[] sites = new int[taxonCount]; for (int j = 0; j < taxonCount; j++) { sites[j] = counts.get(j).get(i); } patternIndex[i] = Arrays.binarySearch(sitePatterns, sites, comparator); } if (siteWeights != null) { Arrays.fill(patternWeight, 0); for (int i = 0; i < siteCount; i++) { patternWeight[patternIndex[i]] += siteWeights[i]; } } // determine maximum state count // Usually, the state count is equal for all sites, // though for SnAP analysis, this is typically not the case. maxStateCount = 0; for (int m_nStateCount1 : stateCounts) { maxStateCount = Math.max(maxStateCount, m_nStateCount1); } // report some statistics if (log && taxaNames.size() < 30) { for (int i = 0; i < taxaNames.size(); i++) { Log.info.println(taxaNames.get(i) + ": " + counts.get(i).size() + " " + stateCounts.get(i)); } } if (stripInvariantSitesInput.get()) { // don't add patterns that are invariant, e.g. all gaps if (log) Log.info.println("Stripping invariant sites"); int removedSites = 0; for (int i = 0; i < patterns; i++) { int[] pattern = sitePatterns[i]; int value = pattern[0]; boolean isInvariant = true; for (int k = 1; k < pattern.length; k++) { if (pattern[k] != value) { isInvariant = false; break; } } if (isInvariant) { removedSites += patternWeight[i]; patternWeight[i] = 0; if (log) Log.info.print(" <" + value + "> "); } } if (log) Log.info.println(" removed " + removedSites + " sites "); } } // calcPatterns /** * @return the total weight of all the patterns (this is the effective number of sites) */ private long getTotalWeight() { long totalWeight = 0; for (int weight : patternWeight) { totalWeight += weight; } return totalWeight; } /** * Pretty printing of vital statistics of an alignment including id, #taxa, #sites, #patterns and totalweight * * @param singleLine true if the string should fit on one line * @return string representing this alignment */ public String toString(boolean singleLine) { long totalWeight = getTotalWeight(); StringBuilder builder = new StringBuilder(); builder.append(getClass().getSimpleName() + "(" + getID() + ")"); if (singleLine) { builder.append(": [taxa, patterns, sites] = [" + getTaxonCount() + ", " + getPatternCount()); builder.append(", " + getTotalWeight() + "]"); } else { long siteCount = getSiteCount(); builder.append('\n'); builder.append(" " + getTaxonCount() + " taxa"); builder.append('\n'); builder.append(" " + siteCount + (siteCount == 1 ? " site" : " sites") + (totalWeight == getSiteCount() ? "" : " with weight " + totalWeight + "")); builder.append('\n'); if (siteCount > 1) { builder.append(" " + getPatternCount() + " patterns"); builder.append('\n'); } } return builder.toString(); } public double[] getTipLikelihoods(int taxonIndex, int patternIndex_) { if (taxonIndex >= tipLikelihoods.size() || tipLikelihoods.get(taxonIndex) == null) { return null; } else { return tipLikelihoods.get(taxonIndex)[patternIndex_]; } } /** * returns an array containing the non-ambiguous states that this state represents. */ public boolean[] getStateSet(int state) { return m_dataType.getStateSet(state); // if (!isAmbiguousState(state)) { // boolean[] stateSet = new boolean[m_nMaxStateCount]; // stateSet[state] = true; // return stateSet; // } else { // } } boolean isAmbiguousState(int state) { return (state < 0 || state >= maxStateCount); } //Methods from AscertainedAlignment public Set<Integer> getExcludedPatternIndices() { return excludedPatterns; } public int getExcludedPatternCount() { return excludedPatterns.size(); } public double getAscertainmentCorrection(double[] patternLogProbs) { double excludeProb = 0, includeProb = 0, returnProb = 1.0; for (int i = 0; i < m_nIncluded.size(); i++) { includeProb += Math.exp(patternLogProbs[m_nIncluded.get(i)]); } for (int i : excludedPatterns) { excludeProb += Math.exp(patternLogProbs[i]); } if (includeProb == 0.0) { returnProb -= excludeProb; } else if (excludeProb == 0.0) { returnProb = includeProb; } else { returnProb = 1.0 + includeProb - excludeProb; } return Math.log(returnProb); } // getAscertainmentCorrection /** * Should not be used. No special order of taxa are assumed. Taxa order should be left to user input. */ @Deprecated static public void sortByTaxonName(List<Sequence> seqs) { Collections.sort(seqs, (Sequence o1, Sequence o2) -> { return o1.taxonInput.get().compareTo(o2.taxonInput.get()); } ); } /** * Get String representation of a sequence according to the current datatype * @param taxon the name of the taxon to get the sequence from in the alignment * @return sequence in String representation */ public String getSequenceAsString(String taxon) { int i = getTaxonIndex(taxon); // build up string from underlying data using the current datatype int [] states = new int[getSiteCount()]; for (int k = 0; k < getSiteCount(); k++) { int d = sitePatterns[patternIndex[k]][i]; states[k] = d; } String seq = null; try { seq = m_dataType.state2string(states); } catch (Exception e) { e.printStackTrace(); } return seq; } } // class Data