package beast.evolution.datatype; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import beast.core.BEASTObject; import beast.core.Description; public interface DataType { final static public char GAP_CHAR = '-'; final static public char MISSING_CHAR = '?'; /** * @return number of states for this data type. * Assuming there is a finite number of states, or -1 otherwise. */ int getStateCount(); /** * Convert a sequence represented by a string into a sequence of integers * representing the state for this data type. * Ambiguous states should be represented by integer numbers higher than getStateCount() * throws exception when parsing error occur * */ List<Integer> string2state(String sequence); /** * Convert an array of states into a sequence represented by a string. * This is the inverse of string2state() * throws exception when State cannot be mapped * */ String state2string(List<Integer> states); String state2string(int[] states); /** * returns an array of length getStateCount() containing the (possibly ambiguous) states * that this state represents. */ public boolean[] getStateSet(int state); /** * returns an array with all non-ambiguous states represented by * a state. */ public int[] getStatesForCode(int state); boolean isAmbiguousState(int state); /** * true if the class is completely self contained and does not need any * further initialisation. Notable exception: GeneralDataype */ boolean isStandard(); /** * data type description, e.g. nucleotide, codon * */ public String getTypeDescription(); /** * Get character corresponding to a given state * * @param state state * <p/> * return corresponding character */ public char getChar(int state); /** * Get a string code corresponding to a given state. By default this * calls getChar but overriding classes may return multicharacter codes. * * @param state state * <p/> * return corresponding code */ public String getCode(int state); @Description(value = "Base class bringing class and interfaces together", isInheritable = false) public abstract class Base extends BEASTObject implements DataType { /** * size of the state space * */ protected int stateCount; /** * maps string encoding to state codes * */ protected String codeMap; public String getCodeMap() { return codeMap; } /** * length of the encoding, e.g. 1 for nucleotide, 3 for codons * */ protected int codeLength; /** * mapping codes to sets of states * */ protected int[][] mapCodeToStateSet; @Override public void initAndValidate() { if (mapCodeToStateSet != null) { if (mapCodeToStateSet.length != codeMap.length() / codeLength) { throw new IllegalArgumentException("codeMap and mapCodeToStateSet have incompatible lengths"); } } } @Override public int getStateCount() { return stateCount; } /** * implementation for single character per state encoding * */ @Override public List<Integer> string2state(String data) { List<Integer> sequence; sequence = new ArrayList<>(); // remove spaces data = data.replaceAll("\\s", ""); data = data.toUpperCase(); if (codeMap == null) { if (data.contains(",")) { // assume it is a comma separated string of integers String[] strs = data.split(","); for (String str : strs) { try { sequence.add(Integer.parseInt(str)); } catch (NumberFormatException e) { sequence.add(-1); } } } else { // assume it is a string where each character is a state for (byte c : data.getBytes()) { switch (c) { case GAP_CHAR: case MISSING_CHAR: sequence.add(-1); break; default: sequence.add(Integer.parseInt((char) c + "")); } } } } else { if (codeLength == 1) { // single character codes for (int i = 0; i < data.length(); i++) { char cCode = data.charAt(i); int stateCount = codeMap.indexOf(cCode); if (stateCount < 0) { throw new IllegalArgumentException("Unknown code found in sequence: " + cCode); } sequence.add(stateCount); } } else if (codeLength > 1) { // multi-character codes of fixed length // use code map to resolve state codes Map<String, Integer> map = new HashMap<>(); // fixed length code for (int i = 0; i < codeMap.length(); i += codeLength) { String code = codeMap.substring(i, i + codeLength); map.put(code, i / codeLength); } for (int i = 0; i < data.length(); i += codeLength) { String code = data.substring(i, i + codeLength).toUpperCase(); if (map.containsKey(code)) { sequence.add(map.get(code)); } else { throw new IllegalArgumentException("Unknown code found in sequence: " + code); } } } else { // variable length code of strings String[] codes = codeMap.toUpperCase().split(","); for (String code : data.split(",")) { boolean isFound = false; for (int codeIndex = 0; codeIndex < codes.length; codeIndex++) { if (code.equals(codes[codeIndex])) { sequence.add(codeIndex); isFound = true; break; } } if (!isFound) { throw new RuntimeException("Could not find code " + code + " in codemap"); } } } } return sequence; } // string2state @Override public String state2string(List<Integer> nrOfStates) { int[] nrOfStates2 = new int[nrOfStates.size()]; for (int i = 0; i < nrOfStates2.length; i++) { nrOfStates2[i] = nrOfStates.get(i); } return state2string(nrOfStates2); } /** * implementation for single character per state encoding * */ @Override public String state2string(int[] nrOfStates) { StringBuffer buf = new StringBuffer(); if (codeMap != null) { for (int state : nrOfStates) { String code = codeMap.substring(state * codeLength, state * codeLength + codeLength); buf.append(code); } } else { // produce a comma separated string of integers for (int i = 0; i < nrOfStates.length - 1; i++) { buf.append(nrOfStates[i] + ","); } buf.append(nrOfStates[nrOfStates.length - 1] + ""); } return buf.toString(); } // state2string @Override public int[] getStatesForCode(int state) { return mapCodeToStateSet[state]; } @Override public boolean[] getStateSet(int state) { boolean[] stateSet = new boolean[stateCount]; int[] stateNumbers = getStatesForCode(state); for (int i : stateNumbers) { stateSet[i] = true; } return stateSet; } // getStateSet /** Default implementations represent non-ambiguous states as numbers * 0 ... stateCount-1, and ambiguous characters as numbers >= stateCount * For data types that count something -- like microsattelites, or number * of lineages in SNAPP -- a stateCount < 0 represents missing data. */ @Override public boolean isAmbiguousState(int state) { return (state < 0 || state >= stateCount); } @Override public boolean isStandard() { return true; } @Override public char getChar(int state) { return (char) (state + 'A'); } @Override public String getCode(int state) { return String.valueOf(getChar(state)); } @Override public String toString() { return getTypeDescription(); } /** return state associated with a character */ public Integer char2state(String character) { return string2state(character).get(0); } } // class Base } // class DataType