Alignment.java example

Explorer
beast2-master
- src
/*
* File Alignment.java
*
* Copyright (C) 2010 Remco Bouckaert remco@cs.auckland.ac.nz
*
* This file is part of BEAST2.
* See the NOTICE file distributed with this work for additional
* information regarding copyright ownership and licensing.
*
* BEAST is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
*  BEAST is distributed in the hope that it will be useful,
*  but WITHOUT ANY WARRANTY; without even the implied warranty of
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*  GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with BEAST; if not, write to the
* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
* Boston, MA  02110-1301  USA
*/
package beast.evolution.alignment;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import beast.core.Description;
import beast.core.Input;
import beast.core.Input.Validate;
import beast.core.parameter.Map;
import beast.core.util.Log;
import beast.evolution.datatype.DataType;
import beast.evolution.datatype.StandardData;
import beast.util.AddOnManager;

@Description("Class representing alignment data")
public class Alignment extends Map<String> {

    @Override
	protected Class<?> mapType() {
        return String.class;
    }

    /**
     * default data type *
     */
    protected final static String NUCLEOTIDE = "nucleotide";

    /**
     * directory to pick up data types from *
     */
    final static String[] IMPLEMENTATION_DIR = {"beast.evolution.datatype"};

    /**
     * list of data type descriptions, obtained from DataType classes *
     */
    static List<String> types = new ArrayList<>();

    static {
        findDataTypes();
    }

    static public void findDataTypes() {
        // build up list of data types
        List<String> m_sDataTypes = AddOnManager.find(beast.evolution.datatype.DataType.class, IMPLEMENTATION_DIR);
        for (String dataTypeName : m_sDataTypes) {
            try {
                DataType dataType = (DataType) Class.forName(dataTypeName).newInstance();
                if (dataType.isStandard()) {
                    String description = dataType.getTypeDescription();
                    if (!types.contains(description)) {
                        types.add(description);
                    }
                }
            } catch (Exception e) {
                // TODO: handle exception
            }
        }
    }

    final public Input<List<Sequence>> sequenceInput =
            new Input<>("sequence", "sequence and meta data for particular taxon", new ArrayList<>(), Validate.OPTIONAL);

    final public Input<TaxonSet> taxonSetInput =
            new Input<>("taxa", "An optional taxon-set used only to sort the sequences into the same order as they appear in the taxon-set.", new TaxonSet(), Validate.OPTIONAL);

    final public Input<Integer> stateCountInput = new Input<>("statecount", "maximum number of states in all sequences");
    final public Input<String> dataTypeInput = new Input<>("dataType", "data type, one of " + types, NUCLEOTIDE, types.toArray(new String[0]));
    final public Input<DataType.Base> userDataTypeInput = new Input<>("userDataType", "non-standard, user specified data type, if specified 'dataType' is ignored");
    final public Input<Boolean> stripInvariantSitesInput = new Input<>("strip", "sets weight to zero for sites that are invariant (e.g. all 1, all A or all unkown)", false);
    final public Input<String> siteWeightsInput = new Input<>("weights", "comma separated list of weights, one for each site in the sequences. If not specified, each site has weight 1");

    final public Input<Boolean> isAscertainedInput = new Input<>("ascertained", "is true if the alignment allows ascertainment correction, i.e., conditioning the " +
            "Felsenstein likelihood on excluding constant sites from the alignment", false);
    /**
     * Inputs from AscertainedAlignment
     */
    final public Input<Integer> excludefromInput = new Input<>("excludefrom", "first site to condition on, default 0", 0);
    final public Input<Integer> excludetoInput = new Input<>("excludeto", "last site to condition on (but excluding this site), default 0", 0);
    final public Input<Integer> excludeeveryInput = new Input<>("excludeevery", "interval between sites to condition on (default 1)", 1);

	public Input<Integer> m_includefrom = new Input<>("includefrom","first site to condition on, default 0", 0);
	public Input<Integer> m_includeto = new Input<>("includeto","last site to condition on, default 0", 0);
	public Input<Integer> m_includeevery = new Input<>("includeevery","interval between sites to condition on (default 1)", 1);

    /**
     * list of sequences in the alignment *
     */
    protected List<Sequence> sequences = new ArrayList<>();

    /**
     * list of taxa names defined through the sequences in the alignment *
     */
    protected List<String> taxaNames = new ArrayList<>();

    /**
     * list of state counts for each of the sequences, typically these are
     * constant throughout the whole alignment.
     */
    protected List<Integer> stateCounts = new ArrayList<>();

    /**
     * maximum of m_nStateCounts *
     */
    protected int maxStateCount;

    /**
     * state codes for the sequences *
     */
    protected List<List<Integer>> counts = new ArrayList<>();

    /**
     * data type, useful for converting String sequence to Code sequence, and back *
     */
    protected DataType m_dataType;

    /**
     * weight over the columns of a matrix *
     */
    protected int[] patternWeight;

    /**
     * weights of sites -- assumed 1 for each site if not specified
     */
    protected int[] siteWeights = null;

    /**
     * Probabilities associated with each tip of the tree, for use when the
     * characters are uncertain.
     */
    public List<double[][]> tipLikelihoods = new ArrayList<>(); // #taxa x #sites x #states
    private boolean usingTipLikelihoods = false;
    
    /**
     * pattern state encodings *
     */
    protected int [][] sitePatterns; // #patterns x #taxa

    /**
     * maps site nr to pattern nr *
     */
    protected int[] patternIndex;

    /**
     * From AscertainedAlignment
     */
    Set<Integer> excludedPatterns;
	List<Integer> m_nIncluded;

    /**
     * A flag to indicate if the alignment is ascertained
     */
    public boolean isAscertained;

    public Alignment() {
    }

    /**
     * Constructor for testing purposes.
     *
     * @param sequences
     * @param stateCount
     * @param dataType
     * @deprecated This is the deprecated legacy form and will be removed
     * at some point. Use {@link #Alignment(List, String)} instead.
     */
    @Deprecated
    public Alignment(List<Sequence> sequences, Integer stateCount, String dataType) {
        this(sequences, dataType);
    }

    /**
     * Constructor for testing purposes.
     *
     * @param sequences
     * @param dataType
     */
    public Alignment(List<Sequence> sequences, String dataType) {
        for (Sequence sequence : sequences) {
            sequenceInput.setValue(sequence, this);
        }
        dataTypeInput.setValue(dataType, this);
        initAndValidate();
    }

    @Override
    public void initAndValidate() {

        if (sequenceInput.get().size() == 0 && defaultInput.get().size() == 0) {
            throw new IllegalArgumentException("Either a sequence input must be specified, or a map of strings must be specified");
        }

        if (siteWeightsInput.get() != null) {
            String str = siteWeightsInput.get().trim();
            String[] strs = str.split(",");
            siteWeights = new int[strs.length];
            for (int i = 0; i < strs.length; i++) {
                siteWeights[i] = Integer.parseInt(strs[i].trim());
            }
        }

        // determine data type, either user defined or one of the standard ones
        if (userDataTypeInput.get() != null) {
            m_dataType = userDataTypeInput.get();
        } else {
            if (types.indexOf(dataTypeInput.get()) < 0) {
                throw new IllegalArgumentException("data type + '" + dataTypeInput.get() + "' cannot be found. " +
                        "Choose one of " + Arrays.toString(types.toArray(new String[0])));
            }
            // seems to spend forever in there??
            List<String> dataTypes = AddOnManager.find(beast.evolution.datatype.DataType.class, IMPLEMENTATION_DIR);
            for (String dataTypeName : dataTypes) {
                DataType dataType;
				try {
					dataType = (DataType) Class.forName(dataTypeName).newInstance();
	                if (dataTypeInput.get().equals(dataType.getTypeDescription())) {
	                    m_dataType = dataType;
	                    break;
	                }
				} catch (InstantiationException | IllegalAccessException | ClassNotFoundException e) {
					throw new IllegalArgumentException(e.getMessage());
				}
            }
        }

        // initialize the sequence list
        if (sequenceInput.get().size() > 0) {
            sequences = sequenceInput.get();
        } else {
            // alignment defined by a map of id -> sequence
            List<String> taxa = new ArrayList<>();
            taxa.addAll(map.keySet());
            sequences.clear();
            for (String key : taxa) {
                String sequence = map.get(key);
                sequences.add(new Sequence(key, sequence));
            }
        }

        // initialize the alignment from the given list of sequences
        initializeWithSequenceList(sequences, true);

        if (taxonSetInput.get() != null && taxonSetInput.get().getTaxonCount() > 0) {
            sortByTaxonSet(taxonSetInput.get());
        }
        Log.info.println(toString(false));
    }

    /**
     * Initializes the alignment given the provided list of sequences and no other information.
     * It site weights and/or data type have been previously set up with initAndValidate then they
     * remain in place. This method is used mainly to re-order the sequences to a new taxon order
     * when an analysis of multiple alignments on the same taxa are undertaken.
     *
     * @param sequences
     */
    private void initializeWithSequenceList(List<Sequence> sequences, boolean log) {
        this.sequences = sequences;
        taxaNames.clear();
        stateCounts.clear();
        counts.clear();
        try {
            for (Sequence seq : sequences) {

                counts.add(seq.getSequence(m_dataType));
                if (taxaNames.contains(seq.getTaxon())) {
                    throw new RuntimeException("Duplicate taxon found in alignment: " + seq.getTaxon());
                }
                taxaNames.add(seq.getTaxon());
                tipLikelihoods.add(seq.getLikelihoods());
                // if seq.isUncertain() == false then the above line adds 'null'
	            // to the list, indicating that this particular sequence has no tip likelihood information
                usingTipLikelihoods |= (seq.getLikelihoods() != null);	            

                if (seq.totalCountInput.get() != null) {
                    stateCounts.add(seq.totalCountInput.get());
                } else {
                    stateCounts.add(m_dataType.getStateCount());
                }
            }
            if (counts.size() == 0) {
                // no sequence data
                throw new RuntimeException("Sequence data expected, but none found");
            }
        } catch (Exception e) {
            e.printStackTrace();
            throw new RuntimeException(e);
        }
        sanityCheckCalcPatternsSetUpAscertainment(log);
    }

    /**
     * Checks that sequences are all the same length, calculates patterns and sets up ascertainment.
     */
    private void sanityCheckCalcPatternsSetUpAscertainment(boolean log) {
        // Sanity check: make sure sequences are of same length
        int length = counts.get(0).size();
        if (!(m_dataType instanceof StandardData)) {
            for (List<Integer> seq : counts) {
                if (seq.size() != length) {
                    throw new RuntimeException("Two sequences with different length found: " + length + " != " + seq.size());
                }
            }
        }
        if (siteWeights != null && siteWeights.length != length) {
            throw new RuntimeException("Number of weights (" + siteWeights.length + ") does not match sequence length (" + length + ")");
        }

        calcPatterns(log);
        setupAscertainment();
    }

    /**
     * Sorts an alignment by a provided TaxonSet, so that the sequence/taxon pairs in the alignment match the order
     * that the taxa appear in the TaxonSet (i.e. not necessarily alphabetically).
     *
     * @param toSortBy the taxon set that species the order on the taxa.
     */
    public void sortByTaxonSet(TaxonSet toSortBy) {

        List<Sequence> sortedSeqs = new ArrayList<>();
        sortedSeqs.addAll(sequences);
        Collections.sort(sortedSeqs, (Sequence o1, Sequence o2) -> {
                return Integer.compare(toSortBy.getTaxonIndex(o1.getTaxon()), toSortBy.getTaxonIndex(o2.getTaxon()));
            }
        );
        initializeWithSequenceList(sortedSeqs, false);
    }

    void setupAscertainment() {
        isAscertained = isAscertainedInput.get();

        if (isAscertained) {
            //From AscertainedAlignment
            int from = excludefromInput.get();
            int to = excludetoInput.get();
            int every = excludeeveryInput.get();
            excludedPatterns = new HashSet<>();
            for (int i = from; i < to; i += every) {
                int patternIndex_ = patternIndex[i];
                // reduce weight, so it does not confuse the tree likelihood
                patternWeight[patternIndex_] = 0;
                excludedPatterns.add(patternIndex_);
            }
            
    		from = m_includefrom.get();
    		to = m_includeto.get();
    		every = m_includeevery.get();
    		m_nIncluded = new ArrayList<>();
    		for (int i = from; i < to; i += every) {
    			int patternIndex_ = patternIndex[i];
    			// reduce weight, so it does not confuse the tree likelihood
    			patternWeight[patternIndex_] = 0;
    			m_nIncluded.add(patternIndex_);
    		}

        } else {
        	// sanity check
            int from = excludefromInput.get();
            int to = excludetoInput.get();
            if (from != excludefromInput.defaultValue || to != excludetoInput.defaultValue) {
            	Log.warning.println("WARNING: excludefrom or excludeto is specified, but 'ascertained' flag is not set to true");
            	Log.warning.println("WARNING: to suppress this warning, remove the excludefrom or excludeto attributes (if no astertainment correction is required)");
            	Log.warning.println("WARNING: or set the 'ascertained' flag to true on element with id=" + getID());
            }
        }

    } // initAndValidate

    static String getSequence(Alignment data, int taxonIndex) {

        int[] states = new int[data.getPatternCount()];
        for (int i = 0; i < data.getPatternCount(); i++) {
            int[] sitePattern = data.getPattern(i);
            states[i] = sitePattern[taxonIndex];
        }
        try {
            return data.getDataType().state2string(states);
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
        return null;
    }


    /*
     * assorted getters and setters *
     */
    public List<String> getTaxaNames() {
        if (taxaNames.size() == 0) {
            try {
                initAndValidate();
            } catch (Exception e) {
                e.printStackTrace();
                throw new RuntimeException(e);
            }
        }
        return taxaNames;
    }

    public List<Integer> getStateCounts() {
        return stateCounts;
    }

    /**
     * Returns a List of Integer Lists where each Integer List represents
     * the sequence corresponding to a taxon.  The taxon is identified by
     * the position of the Integer List in the outer List, which corresponds
     * to the nodeNr of the corresponding leaf node and the position of the
     * taxon name in the taxaNames list.
     *
     * @return integer representation of sequence alignment
     */
    public List<List<Integer>> getCounts() {
        return counts;
    }

    public DataType getDataType() {
        return m_dataType;
    }

    /**
     * @return number of taxa in Alignment.
     */
    public int getTaxonCount() {
        //if (taxonsetInput.get() != null) {
        //	return taxonsetInput.get().getTaxonCount();
        //}
        return taxaNames.size();
    }

    /**
     * @return number of taxa in Alignment.
     * @deprecated Use getTaxonCount() instead.
     */
    @Deprecated
    public int getNrTaxa() {
        return getTaxonCount();
    }

    public int getTaxonIndex(String id) {
        return taxaNames.indexOf(id);
    }

    /**
     * @return Number of unique character patterns in alignment.
     */
    public int getPatternCount() {
        return sitePatterns.length;
    }

    public int[] getPattern(int patternIndex_) {
        return sitePatterns[patternIndex_];
    }

    public int getPattern(int taxonIndex, int patternIndex_) {
        return sitePatterns[patternIndex_][taxonIndex];
    }

    /**
     * Retrieve the "weight" of a particular pattern: the number of sites
     * having that pattern.
     *
     * @param patternIndex_ Index into pattern array.
     * @return pattern weight
     */
    public int getPatternWeight(int patternIndex_) {
        return patternWeight[patternIndex_];
    }

    public int getMaxStateCount() {
        return maxStateCount;
    }

    /**
     * Retrieve index of pattern corresponding to a particular site.
     *
     * @param site Index of site.
     * @return Index of pattern.
     */
    public int getPatternIndex(int site) {
        return patternIndex[site];
    }

    /**
     * @return Total number of sites in alignment.
     */
    public int getSiteCount() {
        return patternIndex.length;
    }

    /**
     * Retrieve an array containing the number of times each character pattern
     * occurs in the alignment.
     *
     * @return Pattern weight array.
     */
    public int[] getWeights() {
        return patternWeight;
    }


    /**
     * SiteComparator is used for ordering the sites,
     * which makes it easy to identify patterns.
     */
    class SiteComparator implements Comparator<int[]> {
        @Override
		public int compare(int[] o1, int[] o2) {
            for (int i = 0; i < o1.length; i++) {
                if (o1[i] > o2[i]) {
                    return 1;
                }
                if (o1[i] < o2[i]) {
                    return -1;
                }
            }
            return 0;
        }
    } // class SiteComparator


    protected void calcPatterns() {
        calcPatterns(true);
    }

        /**
         * calculate patterns from sequence data
         * *
         */
    private void calcPatterns(boolean log) {
        int taxonCount = counts.size();
        int siteCount = counts.get(0).size();

        // convert data to transposed int array
        int[][] data = new int[siteCount][taxonCount];
        for (int i = 0; i < taxonCount; i++) {
            List<Integer> sites = counts.get(i);
            for (int j = 0; j < siteCount; j++) {
                data[j][i] = sites.get(j);
            }
        }

        // sort data
        SiteComparator comparator = new SiteComparator();
        Arrays.sort(data, comparator);

        // count patterns in sorted data
        // if (siteWeights != null) the weights are recalculated below
        int patterns = 1;
        int[] weights = new int[siteCount];
        weights[0] = 1;
        for (int i = 1; i < siteCount; i++) {
            if (usingTipLikelihoods || comparator.compare(data[i - 1], data[i]) != 0) {
            	// In the case where we're using tip probabilities, we need to treat each 
            	// site as a unique pattern, because it could have a unique probability vector.
                patterns++;
                data[patterns - 1] = data[i];
            }
            weights[patterns - 1]++;
        }

        // reserve memory for patterns
        patternWeight = new int[patterns];
        sitePatterns = new int[patterns][taxonCount];
        for (int i = 0; i < patterns; i++) {
            patternWeight[i] = weights[i];
            sitePatterns[i] = data[i];
        }

        // find patterns for the sites
        patternIndex = new int[siteCount];
        for (int i = 0; i < siteCount; i++) {
            int[] sites = new int[taxonCount];
            for (int j = 0; j < taxonCount; j++) {
                sites[j] = counts.get(j).get(i);
            }
            patternIndex[i] = Arrays.binarySearch(sitePatterns, sites, comparator);
        }

        if (siteWeights != null) {
            Arrays.fill(patternWeight, 0);
            for (int i = 0; i < siteCount; i++) {
                patternWeight[patternIndex[i]] += siteWeights[i];
            }
        }

        // determine maximum state count
        // Usually, the state count is equal for all sites,
        // though for SnAP analysis, this is typically not the case.
        maxStateCount = 0;
        for (int m_nStateCount1 : stateCounts) {
            maxStateCount = Math.max(maxStateCount, m_nStateCount1);
        }
        // report some statistics
        if (log && taxaNames.size() < 30) {
            for (int i = 0; i < taxaNames.size(); i++) {
                Log.info.println(taxaNames.get(i) + ": " + counts.get(i).size() + " " + stateCounts.get(i));
            }
        }

        if (stripInvariantSitesInput.get()) {
            // don't add patterns that are invariant, e.g. all gaps
            if (log) Log.info.println("Stripping invariant sites");

            int removedSites = 0;
            for (int i = 0; i < patterns; i++) {
                int[] pattern = sitePatterns[i];
                int value = pattern[0];
                boolean isInvariant = true;
                for (int k = 1; k < pattern.length; k++) {
                    if (pattern[k] != value) {
                        isInvariant = false;
                        break;
                    }
                }
                if (isInvariant) {
                    removedSites += patternWeight[i];
                    patternWeight[i] = 0;

                    if (log) Log.info.print(" <" + value + "> ");
                }
            }
            if (log) Log.info.println(" removed " + removedSites + " sites ");
        }
    } // calcPatterns

    /**
     * @return the total weight of all the patterns (this is the effective number of sites)
     */
    private long getTotalWeight() {
        long totalWeight = 0;
        for (int weight : patternWeight) {
            totalWeight += weight;
        }
        return totalWeight;
    }

    /**
     * Pretty printing of vital statistics of an alignment including id, #taxa, #sites, #patterns and totalweight
     *
     * @param singleLine true if the string should fit on one line
     * @return string representing this alignment
     */
    public String toString(boolean singleLine) {
        long totalWeight = getTotalWeight();
        StringBuilder builder = new StringBuilder();
        builder.append(getClass().getSimpleName() + "(" + getID() + ")");

        if (singleLine) {
            builder.append(": [taxa, patterns, sites] = [" + getTaxonCount() + ", " + getPatternCount());
            builder.append(", " + getTotalWeight() + "]");
        } else {

            long siteCount = getSiteCount();

            builder.append('\n');
            builder.append("  " + getTaxonCount() + " taxa");
            builder.append('\n');
            builder.append("  " + siteCount + (siteCount == 1 ? " site" : " sites") + (totalWeight == getSiteCount() ? "" : " with weight " + totalWeight + ""));
            builder.append('\n');
            if (siteCount > 1) {
                builder.append("  " + getPatternCount() + " patterns");
                builder.append('\n');
            }
        }
        return builder.toString();
    }

    public double[] getTipLikelihoods(int taxonIndex, int patternIndex_) {
    	if (taxonIndex >= tipLikelihoods.size() || tipLikelihoods.get(taxonIndex) == null) { 
    		return null; 
    	} else { 
    		return tipLikelihoods.get(taxonIndex)[patternIndex_];
    	}
    	
    }
    /**
     * returns an array containing the non-ambiguous states that this state represents.
     */
    public boolean[] getStateSet(int state) {
        return m_dataType.getStateSet(state);
//        if (!isAmbiguousState(state)) {
//            boolean[] stateSet = new boolean[m_nMaxStateCount];
//            stateSet[state] = true;
//            return stateSet;
//        } else {
//        }
    }

    boolean isAmbiguousState(int state) {
        return (state < 0 || state >= maxStateCount);
    }

    //Methods from AscertainedAlignment
    public Set<Integer> getExcludedPatternIndices() {
        return excludedPatterns;
    }

    public int getExcludedPatternCount() {
        return excludedPatterns.size();
    }

	public double getAscertainmentCorrection(double[] patternLogProbs) {
        double excludeProb = 0, includeProb = 0, returnProb = 1.0;

        for (int i = 0; i < m_nIncluded.size(); i++) {
        	includeProb += Math.exp(patternLogProbs[m_nIncluded.get(i)]);
        }

        for (int i : excludedPatterns) {
            excludeProb += Math.exp(patternLogProbs[i]);
        }

        if (includeProb == 0.0) {
            returnProb -= excludeProb;
        } else if (excludeProb == 0.0) {
            returnProb = includeProb;
        } else {
            returnProb = 1.0 + includeProb - excludeProb;
        }
        return Math.log(returnProb);
    } // getAscertainmentCorrection

    /**
     * Should not be used. No special order of taxa are assumed. Taxa order should be left to user input.
     */
    @Deprecated
    static public void sortByTaxonName(List<Sequence> seqs) {
        Collections.sort(seqs, (Sequence o1, Sequence o2) -> {
                return o1.taxonInput.get().compareTo(o2.taxonInput.get());
            }
        );
    }

    /** 
     * Get String representation of a sequence according to the current datatype
     * @param taxon the name of the taxon to get the sequence from in the alignment
     * @return sequence in String representation
     */
	public String getSequenceAsString(String taxon) {
		int i = getTaxonIndex(taxon);		

		// build up string from underlying data using the current datatype
		int [] states = new int[getSiteCount()];
		for (int k = 0; k < getSiteCount(); k++) {
			int d = sitePatterns[patternIndex[k]][i];
			states[k] = d;
		}
		String seq = null;
		try {
			seq = m_dataType.state2string(states);
		} catch (Exception e) {
			e.printStackTrace();
		}
		return seq;
	}
} // class Data