package statalign.io;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
/**
 * Dynamic array of raw sequence data. Sequences can be aligned, in which
 * case the strings must be of equal length and the '-' character must
 * represent the gaps.
 *
 * <p>The alphabet and the maximum name length are computed lazily and
 * cached. Every method of this class that changes the stored names or
 * sequences resets the affected cache, so {@link #getAlphabet()} and
 * {@link #getMaxNameLength()} always reflect the current content.
 *
 * @author novak, miklos, aszabo
 *
 */
public class RawSequences implements DataType {

    /** Symbols that {@link #isRNA()} accepts in the alphabet of a nucleotide data set. */
    private static final String NUCLEOTIDE_SYMBOLS = "ACGUTWSRYKMDVHBXN";

    /**
     * Dynamic array of raw sequence data.
     */
    private final List<String> sequences = new ArrayList<String>();

    /**
     * Dynamic array of sequence names. Any name can be null.
     */
    private final List<String> seqNames = new ArrayList<String>();

    /**
     * Sorted string of characters present in sequences. Does not contain the
     * gap '-'. This is a lazily filled cache and may be {@code null}; use
     * {@link #getAlphabet()} to obtain a value that is never {@code null}.
     */
    public String alphabet;

    /** Cached length of the longest name, or -1 when it has to be recomputed. */
    private int maxNameLength = -1;

    /**
     * Creates an empty collection.
     */
    public RawSequences() {
    }

    /**
     * Creates a collection holding a single sequence.
     *
     * @param seq the sequence
     * @param name the name of the sequence
     */
    public RawSequences(String seq, String name) {
        sequences.add(seq);
        seqNames.add(name);
    }

    /**
     * Returns this object itself.
     */
    public RawSequences getSeqs() {
        return this;
    }

    /**
     * Currently a no-op: the argument is ignored and the content of this
     * object is left unchanged.
     */
    public void setSeqs(RawSequences rs) {
    }

    /**
     * Returns the internal, modifiable list of sequence names. Changes made
     * through the returned list bypass the cache handling of this class;
     * {@link #getSeqnames()} returns a read-only view instead.
     */
    public List<String> getNames() {
        return seqNames;
    }

    /**
     * Always returns {@code false}.
     */
    public boolean perSequenceData() {
        return false;
    }

    /**
     * Returns the given sequence name unchanged.
     */
    public String getSummaryAssociatedWith(String sequenceName) {
        return sequenceName;
    }

    /**
     * Does nothing.
     */
    public void removeDataAssociatedWith(String sequenceName) {
    }

    /**
     * Tells whether all sequences have the same length. An empty collection
     * counts as aligned.
     */
    public boolean isAligned() {
        return len() != -1;
    }

    /**
     * Returns the common length of the aligned sequences or -1 if sequences are
     * unaligned (have different length). Returns 0 for an empty collection.
     */
    public int len() {
        if (sequences.isEmpty()) {
            return 0;
        }
        int len = sequences.get(0).length();
        for (int i = 1; i < sequences.size(); i++) {
            if (sequences.get(i).length() != len) {
                return -1;
            }
        }
        return len;
    }

    /**
     * Adds a name and a sequence that belong together. If a sequence with the
     * same name is already present the call has no effect.
     */
    public void add(String name, String sequence) {
        add(name, sequence, false);
    }

    /**
     * Adds a name and a sequence that belong together. If another sequence with
     * the same (non-null) name is already present it is replaced when
     * {@code overwrite} is set and left untouched otherwise. Unnamed
     * ({@code null} name) sequences never collide and are always appended.
     *
     * @param name name of the sequence, may be {@code null}
     * @param sequence the sequence data
     * @param overwrite whether an existing sequence with the same name is replaced
     */
    public void add(String name, String sequence, boolean overwrite) {
        int index = (name == null) ? -1 : seqNames.indexOf(name);
        if (index == -1) {
            seqNames.add(name);
            sequences.add(sequence);
            maxNameLength = -1;
        } else if (overwrite) {
            sequences.set(index, sequence);
        } else {
            return; // name already present and must not be replaced: nothing changed
        }
        // The stored sequence data changed, so the cached alphabet is stale.
        alphabet = null;
    }

    /**
     * Adds all sequences of {@code more}, keeping existing sequences on name
     * collisions.
     */
    public void add(RawSequences more) {
        add(more, false);
    }

    /**
     * Adds all sequences of {@code more}, replacing existing sequences on name
     * collisions.
     */
    public void addOrReplace(RawSequences more) {
        add(more, true);
    }

    /**
     * Adds all sequences of {@code more} one by one (so that names are checked)
     * and sets the alphabet to the union of the two alphabets.
     *
     * @param more the sequences to add
     * @param overwrite whether existing sequences are replaced on name collisions
     */
    public void add(RawSequences more, boolean overwrite) {
        // Fix the bound up front so that adding a collection to itself terminates.
        int count = more.seqNames.size();
        for (int i = 0; i < count; i++) {
            add(more.seqNames.get(i), more.sequences.get(i), overwrite);
        }
        alphabet = mergeAlphabets(getAlphabet(), more.getAlphabet());
        maxNameLength = -1;
    }

    /**
     * Merges two sorted alphabets into one sorted string in which a character
     * occurring in both inputs appears only once.
     */
    private static String mergeAlphabets(String first, String second) {
        int len1 = first.length();
        int len2 = second.length();
        StringBuilder merged = new StringBuilder(len1 + len2);
        int i = 0;
        int j = 0;
        while (i < len1 && j < len2) {
            char char1 = first.charAt(i);
            char char2 = second.charAt(j);
            if (char1 < char2) {
                merged.append(char1);
                i++;
            } else if (char1 > char2) {
                merged.append(char2);
                j++;
            } else {
                merged.append(char1);
                i++;
                j++;
            }
        }
        // At most one of the inputs still has characters left.
        merged.append(first, i, len1).append(second, j, len2);
        return merged.toString();
    }

    /**
     * Returns the length of the longest sequence name (null names are
     * skipped), or 0 if there is no non-null name.
     */
    public int getMaxNameLength() {
        if (maxNameLength == -1) {
            int longest = 0;
            for (String name : seqNames) {
                if (name != null && name.length() > longest) {
                    longest = name.length();
                }
            }
            maxNameLength = longest;
        }
        return maxNameLength;
    }

    /**
     * Returns the alphabet of the sequences: the sorted string of upper-case
     * letters 'A'-'Z' that occur in any sequence, ignoring case. All other
     * characters, including the gap '-', are left out. The value is cached in
     * {@link #alphabet}.
     */
    public String getAlphabet() {
        if (alphabet == null) {
            boolean[] present = new boolean[26];
            for (String seq : sequences) {
                for (int i = 0; i < seq.length(); i++) {
                    char ch = seq.charAt(i);
                    // Plain ASCII tests: Character.toUpperCase can map characters
                    // below 256 (e.g. U+00FF, U+00B5) to code points above 255.
                    if (ch >= 'a' && ch <= 'z') {
                        present[ch - 'a'] = true;
                    } else if (ch >= 'A' && ch <= 'Z') {
                        present[ch - 'A'] = true;
                    }
                }
            }
            StringBuilder aBuilder = new StringBuilder();
            for (int k = 0; k < present.length; k++) {
                if (present[k]) {
                    aBuilder.append((char) ('A' + k));
                }
            }
            alphabet = aBuilder.toString();
        }
        return alphabet;
    }

    /**
     * Returns the number of sequences.
     */
    public int size() {
        return seqNames.size();
    }

    /**
     * Returns the name of the i-th sequence (may be {@code null}).
     */
    public String getSeqName(int i) {
        return seqNames.get(i);
    }

    /**
     * Returns the i-th sequence.
     */
    public String getSequence(int i) {
        return sequences.get(i);
    }

    /**
     * Returns a read-only view of the sequence names.
     */
    public List<String> getSeqnames() {
        return Collections.unmodifiableList(seqNames);
    }

    /**
     * Returns a read-only view of the sequences.
     */
    public List<String> getSequences() {
        return Collections.unmodifiableList(sequences);
    }

    /**
     * Returns the name of the i-th sequence, right-padded with spaces to the
     * length of the longest name. A {@code null} name is treated as an empty
     * name.
     */
    public String getSeqNamePadded(int i) {
        String n = seqNames.get(i);
        if (n == null) {
            n = "";
        }
        int pad = getMaxNameLength() - n.length();
        if (pad > 0) {
            char[] arr = new char[pad];
            Arrays.fill(arr, ' ');
            n += new String(arr);
        }
        return n;
    }

    /**
     * Removes a sequence and its name.
     *
     * @return true on success, false if there is no sequence with index i
     */
    public boolean remove(int i) {
        if (i < 0 || i >= size()) {
            return false;
        }
        sequences.remove(i);
        seqNames.remove(i);
        // Both caches may depend on the removed entry.
        maxNameLength = -1;
        alphabet = null;
        return true;
    }

    /**
     * Removes a sequence and its name (given by name).
     *
     * @return true on success
     */
    public boolean removeByName(String name) {
        int ind = seqNames.indexOf(name);
        if (ind == -1) {
            return false;
        }
        remove(ind);
        return true;
    }

    /**
     * Removes all sequences (and their names).
     */
    public void clear() {
        seqNames.clear();
        sequences.clear();
        alphabet = null;
        maxNameLength = -1;
    }

    /**
     * Removes all gaps from each sequence.
     */
    public void removeGaps() {
        for (int i = 0; i < sequences.size(); i++) {
            sequences.set(i, sequences.get(i).replace("-", ""));
        }
    }

    /**
     * Returns the sequences as text: for each sequence a line with '>'
     * followed by the name, then a line with the sequence.
     */
    @Override
    public String toString() {
        StringBuilder s = new StringBuilder();
        for (int i = 0; i < seqNames.size(); i++) {
            s.append('>').append(seqNames.get(i)).append('\n')
                    .append(sequences.get(i)).append('\n');
        }
        return s.toString();
    }

    /**
     * Tells whether the data looks like RNA: every character of the alphabet
     * is one of the accepted nucleotide symbols and the alphabet contains 'U'.
     */
    public boolean isRNA() {
        // Go through the getter: the alphabet field is filled lazily and may be null.
        String alpha = getAlphabet();
        for (int i = 0; i < alpha.length(); i++) {
            if (NUCLEOTIDE_SYMBOLS.indexOf(alpha.charAt(i)) == -1) {
                return false;
            }
        }
        return alpha.indexOf('U') != -1;
    }

    /**
     * Small manual check: merges the alphabets of two collections and prints
     * the result.
     */
    public static void main(String[] args) throws IOException {
        RawSequences r1 = new RawSequences();
        r1.sequences.add("LceIfhFil");
        RawSequences r2 = new RawSequences();
        r2.sequences.add("aeghijko");
        r1.add(r2);
        System.out.println(r1.alphabet);
    }
}