package statalign.io.input.plugins;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.List;

import statalign.io.RawSequences;
import statalign.io.input.DataReader;
import statalign.io.input.IllegalFormatException;

/**
 * Class to read files in FASTA format.
 *
 * <p>A record starts with a header line beginning with {@code '>'}; every
 * following non-empty line up to the next header (or end of input) contributes
 * characters to that record's sequence. Only ASCII letters and the gap
 * character {@code '-'} are accepted as sequence characters; whitespace inside
 * sequence lines is skipped.
 *
 * @author novak
 */
public class FastaReader extends DataReader {

	/**
	 * Lookup table indexed by character code: {@code true} for characters that
	 * may appear in a sequence (a-z, A-Z and '-'). All allowed characters are
	 * ASCII, so any code outside the table is illegal by definition.
	 */
	private static final boolean[] ALLOWED_CHARS = new boolean[128];

	static {
		for (int ch = 'a'; ch <= 'z'; ch++) {
			ALLOWED_CHARS[ch] = true;
		}
		for (int ch = 'A'; ch <= 'Z'; ch++) {
			ALLOWED_CHARS[ch] = true;
		}
		ALLOWED_CHARS['-'] = true;
	}

	@Override
	public List<String> supportedExtensions() {
		return Arrays.asList("fsa", "fas", "fasta");
	}

	/**
	 * Reads the contents (aligned/non-aligned sequences) of the given data source in
	 * Fasta format.
	 *
	 * <p>The supplied reader is wrapped in a {@link BufferedReader}; it is not
	 * closed by this method.
	 *
	 * @param reader Data source
	 * @return RawSequences representation of the contents
	 * @throws IllegalFormatException if the input contains no header line, has
	 *         sequence data before the first header, contains a record without
	 *         sequence data, or contains a character other than a letter,
	 *         '-' or whitespace in a sequence line
	 * @throws IOException if an I/O error occurs
	 */
	@Override
	public RawSequences read(Reader reader) throws IOException {
		RawSequences result = new RawSequences();
		BufferedReader br = new BufferedReader(reader);

		String name = null;
		boolean inSeq = false;
		StringBuilder actSeq = new StringBuilder();

		while (true) {
			String line = br.readLine();
			if (line != null && line.length() == 0) {
				continue;
			}

			if (line == null || line.charAt(0) == '>') {
				// End of input or start of a new record: flush the record in progress.
				if (inSeq) {
					if (actSeq.length() == 0) {
						// Report the record being closed; it has not been added to
						// result, so its name is only available in the local variable.
						throw new IllegalFormatException("FastaReader: empty sequence " + name);
					}
					result.add(name, actSeq.toString());
				}
				if (line == null) {
					break;
				}
				actSeq.setLength(0);
				inSeq = true;
				name = parseName(line);
			} else if (inSeq) {
				appendResidues(line, actSeq);
			} else {
				throw new IllegalFormatException("FastaReader: data without sequence name");
			}
		}

		// inSeq is still false only if no header line was ever encountered.
		if (!inSeq) {
			throw new IllegalFormatException("FastaReader: empty file");
		}
		return result;
	}

	/**
	 * Derives a sequence name from a header line: drops the leading '>' and one
	 * optional space directly after it, collapses each run of spaces/tabs into a
	 * single underscore and replaces parentheses with curly braces.
	 *
	 * @param header header line, starting with '>'
	 * @return the normalised sequence name (possibly empty)
	 */
	private static String parseName(String header) {
		int start = (header.length() > 1 && header.charAt(1) == ' ') ? 2 : 1;
		String name = header.substring(start);
		name = name.replaceAll("[ \t]+", "_");
		return name.replace('(', '{').replace(')', '}');
	}

	/**
	 * Appends the sequence characters of one input line to the given buffer,
	 * skipping whitespace.
	 *
	 * @param line non-header input line
	 * @param seq buffer collecting the current record's sequence
	 * @throws IllegalFormatException if the line contains a non-whitespace
	 *         character that is not a letter a-z/A-Z or '-'
	 */
	private static void appendResidues(String line, StringBuilder seq) throws IllegalFormatException {
		int len = line.length();
		for (int i = 0; i < len; i++) {
			char ch = line.charAt(i);
			if (Character.isWhitespace(ch)) {
				continue;
			}
			// Bounds check first: characters beyond the table are never allowed and
			// must produce a format error, not an ArrayIndexOutOfBoundsException.
			if (ch >= ALLOWED_CHARS.length || !ALLOWED_CHARS[ch]) {
				throw new IllegalFormatException("FastaReader: illegal character " + ch);
			}
			seq.append(ch);
		}
	}
}