/* * @author Ahmed Moustafa (ahmed at users.sourceforge.net) * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ package jaligner.util; import jaligner.Sequence; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.InputStreamReader; import java.util.StringTokenizer; import java.util.logging.Level; import java.util.logging.Logger; /** * SequenceParser to sequences from different formats. * <br> * Currently the supported formats are: * <ul> * <li>Plain sequence</li> * <li><a href="http://www.ncbi.nlm.nih.gov/BLAST/fasta.html">Sequence</a></li> * </ul> * * @author Ahmed Moustafa (ahmed@users.sf.net) */ public class SequenceParser { /** * Logger */ private static final Logger logger = Logger.getLogger(SequenceParser.class.getName()); /** * Returns a parsed Sequence from a string. * @param sequence string to parse * @return parsed sequence * @throws SequenceParserException * @see Sequence */ public static Sequence parse (String sequence) throws SequenceParserException { if (sequence == null) { throw new SequenceParserException ( "Null sequence" ); } if (sequence.trim().length() == 0) { throw new SequenceParserException ( "Empty sequence" ); } sequence = sequence.replaceAll("\r\n", "\n"); String sequenceName = null; String sequenceDescription = null; if (sequence.startsWith(">")) { // FASTA format int index = sequence.indexOf("\n"); if (index == -1) { throw new SequenceParserException ( "Invalid sequence format" ); } String first = sequence.substring(1, index); sequence = sequence.substring(index); index = 0; for (int i = 0; i < first.length() && first.charAt(i) != ' ' && first.charAt(i) != '\t'; i++, index++) { // Skip white spaces } sequenceName = first.substring(0, index); StringTokenizer stringTokenizer = new StringTokenizer(sequenceName, "|"); while (stringTokenizer.hasMoreTokens()) { sequenceName = stringTokenizer.nextToken(); } sequenceDescription = index + 1 > first.length() ? "" : first.substring(index + 1); } else { // Plain format ... nothing to do here } Sequence s = new Sequence(prepare(sequence), sequenceName, sequenceDescription, Sequence.PROTEIN); return s; } /** * Returns a Sequence parsed and loaded from a file * @param file to parse * @return parsed sequence * @throws SequenceParserException * @see Sequence */ public static Sequence parse (File file) throws SequenceParserException { String sequenceName = null; String sequenceDescription = null; BufferedReader reader = null; try { reader = new BufferedReader(new InputStreamReader(new FileInputStream(file))); StringBuffer buffer = new StringBuffer(); // Read & parse the first line String line = reader.readLine(); if (line.startsWith(">")) { // FASTA sequence line = line.substring(1).trim(); int index = 0; for (int i = 0; i < line.length() && line.charAt(i) != ' ' && line.charAt(i) != '\t'; i++, index++) { // Skip white spaces } sequenceName = line.substring(0, index); StringTokenizer stringTokenizer = new StringTokenizer(sequenceName, "|"); while (stringTokenizer.hasMoreTokens()) { sequenceName = stringTokenizer.nextToken(); } sequenceDescription = index + 1 > line.length() ? "" : line.substring(index + 1); } else { // Plain sequence buffer.append(prepare(line)); } // Read the remaining the file (the actual sequence) while ((line = reader.readLine()) != null) { buffer.append(prepare(line)); } reader.close(); Sequence s = new Sequence(buffer.toString(), sequenceName, sequenceDescription, Sequence.PROTEIN); return s; } catch (Exception e) { throw new SequenceParserException(e.getMessage()); } finally { if (reader != null) { try { reader.close(); } catch (Exception silent) { logger.log(Level.WARNING, "Failed closing reader: " + silent.getMessage(), silent); } } } } /** * Removes whitespaces from a sequence and validates other characters. * @param sequence sequence to be prepared * @return prepared array of characters * @throws SequenceParserException */ private static String prepare (String sequence) throws SequenceParserException { StringBuffer buffer = new StringBuffer(); String copy = sequence.trim().toUpperCase(); for (int i = 0, n = copy.length(); i < n; i++) { switch ( copy.charAt(i) ) { // skip whitespaces case 9: case 10: case 13: case 32: break; // add a valid character case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I': case 'K': case 'L': case 'M': case 'N': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': case 'V': case 'W': case 'Y': case 'Z': case 'X': case '-': case '*': buffer.append(copy.charAt(i)); break; // throw an exception for anything else default: throw new SequenceParserException( "Invalid sequence character: '" + copy.charAt(i) + "'"); } } return buffer.toString(); } }