/*
 * @(#)SequenceUtil.java 1.0 September 2009
 *
 * Copyright (c) 2009 Peter Troshin
 *
 *        BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.data.sequence;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Utility class for operations on sequences: classification of raw sequence
 * strings (nucleotide / protein), clean-up of sequence strings and reading /
 * writing of FASTA formatted streams.
 *
 * @author Peter Troshin
 * @version 1.0
 * @since 3.0.2
 */
public final class SequenceUtil {

    private static final Logger logger = LoggerFactory.getLogger(SequenceUtil.class);

    /**
     * A whitespace character: [ \t\n\x0B\f\r]
     */
    public static final Pattern WHITE_SPACE = Pattern.compile("\\s");

    /**
     * A digit
     */
    public static final Pattern DIGIT = Pattern.compile("\\d");

    /**
     * Non word character, i.e. anything except [a-zA-Z_0-9]
     */
    public static final Pattern NONWORD = Pattern.compile("\\W");

    /**
     * Valid (non-ambiguous) amino acid letters, case insensitive
     */
    public static final Pattern AA = Pattern.compile("[ARNDCQEGHILKMFPSTWYVUO]+",
            Pattern.CASE_INSENSITIVE);

    /**
     * Any character that is neither a valid amino acid letter nor X, i.e. the
     * inversion of the {@link #AMBIGUOUS_AA} pattern
     */
    public static final Pattern NON_AA = Pattern.compile(
            "[^ARNDCQEGHILKMFPSTWYVXUO]+", Pattern.CASE_INSENSITIVE);

    /**
     * Same as the {@link #AA} pattern but with one additional letter - X
     */
    public static final Pattern AMBIGUOUS_AA = Pattern.compile(
            "[ARNDCQEGHILKMFPSTWYVXUO]+", Pattern.CASE_INSENSITIVE);

    /**
     * Nucleotides a, t, g, c, u
     */
    public static final Pattern NUCLEOTIDE = Pattern.compile("[AGTCU]+",
            Pattern.CASE_INSENSITIVE);

    /**
     * Ambiguous nucleotide codes (see IUPAC): AGTCRYMKSWHBVDN plus U. Of these
     * letters only B is not also matched by the {@link #AA} pattern.
     */
    public static final Pattern AMBIGUOUS_NUCLEOTIDE = Pattern.compile(
            "[AGTCRYMKSWHBVDNU]+", Pattern.CASE_INSENSITIVE);

    /**
     * Any character that is not one of the nucleotides a, t, g, c, u
     */
    public static final Pattern NON_NUCLEOTIDE = Pattern.compile("[^AGTCU]+",
            Pattern.CASE_INSENSITIVE);

    /**
     * Underscores and dashes. The underscore is a word character and therefore
     * survives the {@link #NONWORD} clean-up in
     * {@link #deepCleanSequence(String)}.
     */
    private static final Pattern UNDERSCORE_OR_DASH = Pattern.compile("[_-]+");

    /**
     * Utility class, no instantiation.
     */
    private SequenceUtil() {
    }

    /**
     * Checks whether the sequence held by the given FastaSequence is a
     * non-ambiguous nucleotide sequence.
     *
     * @param s
     *            the sequence to test
     * @return true if the sequence contains only letters a, c, t, g, u
     * @see #isNonAmbNucleotideSequence(String)
     */
    public static boolean isNucleotideSequence(final FastaSequence s) {
        return SequenceUtil.isNonAmbNucleotideSequence(s.getSequence());
    }

    /**
     * Checks whether the sequence consists of non-ambiguous nucleotide letters
     * only. Whitespace is removed before the test. Ambiguous IUPAC codes (see
     * {@link #AMBIGUOUS_NUCLEOTIDE}) are NOT accepted by this method.
     *
     * @param sequence
     *            the sequence to test
     * @return true if, after whitespace removal, the sequence is not empty
     *         and contains nothing but the letters a, c, t, g, u (in any
     *         case); false otherwise
     */
    public static boolean isNonAmbNucleotideSequence(String sequence) {
        final String cleaned = SequenceUtil.cleanSequence(sequence);
        if (SequenceUtil.DIGIT.matcher(cleaned).find()) {
            return false;
        }
        if (SequenceUtil.NON_NUCLEOTIDE.matcher(cleaned).find()) {
            return false;
        }
        // Only nucleotide letters can be left at this point; find() fails
        // solely for the empty string.
        return SequenceUtil.NUCLEOTIDE.matcher(cleaned).find();
    }

    /**
     * Removes all whitespace chars in the sequence string and converts it to
     * upper case. The conversion uses {@link Locale#ROOT} so that the result
     * does not depend on the default locale of the JVM (e.g. under a Turkish
     * locale a lower case 'i' would otherwise not become 'I').
     *
     * @param sequence
     *            the sequence to clean, must not be null
     * @return cleaned up sequence
     */
    public static String cleanSequence(String sequence) {
        assert sequence != null;
        final Matcher whitespace = SequenceUtil.WHITE_SPACE.matcher(sequence);
        return whitespace.replaceAll("").toUpperCase(Locale.ROOT);
    }

    /**
     * Removes all special characters and digits as well as whitespace chars
     * from the sequence. The result is upper case, see
     * {@link #cleanSequence(String)}.
     *
     * @param sequence
     *            the sequence to clean, must not be null
     * @return cleaned up sequence
     */
    public static String deepCleanSequence(String sequence) {
        String cleaned = SequenceUtil.cleanSequence(sequence);
        cleaned = SequenceUtil.DIGIT.matcher(cleaned).replaceAll("");
        cleaned = SequenceUtil.NONWORD.matcher(cleaned).replaceAll("");
        cleaned = SequenceUtil.UNDERSCORE_OR_DASH.matcher(cleaned).replaceAll("");
        return cleaned;
    }

    /**
     * Checks whether the sequence looks like a protein sequence. Whitespace
     * is removed before the test.
     * <p>
     * The sequence is rejected if it is a non-ambiguous nucleotide sequence,
     * if it contains a digit, or if it contains any character matched by
     * {@link #NON_AA}. Note that the letter X is not matched by
     * {@link #NON_AA}, so a sequence containing X is still accepted as long
     * as it also contains at least one letter matched by {@link #AA}.
     *
     * @param sequence
     *            the sequence to test
     * @return true if the sequence is a protein sequence, false otherwise
     */
    public static boolean isProteinSequence(String sequence) {
        final String cleaned = SequenceUtil.cleanSequence(sequence);
        if (SequenceUtil.isNonAmbNucleotideSequence(cleaned)) {
            return false;
        }
        if (SequenceUtil.DIGIT.matcher(cleaned).find()) {
            return false;
        }
        if (SequenceUtil.NON_AA.matcher(cleaned).find()) {
            logger.info("Found non aa: {}", cleaned);
            return false;
        }
        return SequenceUtil.AA.matcher(cleaned).find();
    }

    /**
     * Checks whether the sequence conforms to an ambiguous protein sequence.
     * Whitespace is removed before the test.
     * <p>
     * The sequence is rejected if it is a non-ambiguous nucleotide sequence,
     * if it contains a digit, if it contains any character matched by
     * {@link #NON_AA}, or if any part of it is matched by {@link #AA}.
     * <p>
     * NOTE(review): because the {@link #AA} test uses a partial match, a
     * sequence mixing standard residues with X (e.g. "MKXL") is rejected; as
     * implemented only sequences made up entirely of X are accepted. Confirm
     * whether this is intended before relying on it.
     *
     * @param sequence
     *            the sequence to test
     * @return true only if the sequence passes all the checks above and
     *         contains at least one letter matched by {@link #AMBIGUOUS_AA};
     *         false otherwise, e.g. if the sequence is a non-ambiguous
     *         protein or DNA
     */
    public static boolean isAmbiguosProtein(String sequence) {
        final String cleaned = SequenceUtil.cleanSequence(sequence);
        if (SequenceUtil.isNonAmbNucleotideSequence(cleaned)) {
            return false;
        }
        if (SequenceUtil.DIGIT.matcher(cleaned).find()) {
            return false;
        }
        if (SequenceUtil.NON_AA.matcher(cleaned).find()) {
            return false;
        }
        if (SequenceUtil.AA.matcher(cleaned).find()) {
            return false;
        }
        return SequenceUtil.AMBIGUOUS_AA.matcher(cleaned).find();
    }

    /**
     * Writes list of FastaSequences into the outstream formatting the sequence
     * so that it contains width chars on each line. Characters are encoded
     * with the platform default charset. The stream is flushed and closed
     * when this method returns, also if writing fails.
     *
     * @param outstream
     *            the stream to write to, closed by this method
     * @param sequences
     *            the sequences to write
     * @param width
     *            - the maximum number of characters to write in one line
     * @throws IOException
     *             if writing to or closing the stream fails
     */
    public static void writeFasta(final OutputStream outstream,
            final List<FastaSequence> sequences, final int width)
            throws IOException {
        // Closing the BufferedWriter flushes it and closes the wrapped
        // OutputStreamWriter and the underlying stream.
        try (BufferedWriter fastawriter = new BufferedWriter(
                new OutputStreamWriter(outstream))) {
            for (final FastaSequence fs : sequences) {
                fastawriter.write(fs.getFormatedSequence(width));
            }
        }
    }

    /**
     * Reads fasta sequences from inStream into the list of FastaSequence
     * objects. The stream is decoded as UTF-8 and is closed when this method
     * returns, also if reading fails.
     * <p>
     * Every line starting with '&gt;' begins a new record; the remainder of
     * that line (without the '&gt;') becomes the sequence name. All following
     * lines up to the next header are concatenated, with all whitespace
     * removed, to form the sequence. Blank lines before the first header are
     * ignored.
     *
     * @param inStream
     *            the stream to read from, closed by this method
     * @return list of FastaSequence objects, empty if the stream contains no
     *         records
     * @throws IOException
     *             if reading fails or if sequence data is found before the
     *             first header line
     */
    public static List<FastaSequence> readFasta(final InputStream inStream)
            throws IOException {
        final List<FastaSequence> seqs = new ArrayList<>();
        try (BufferedReader infasta = new BufferedReader(
                new InputStreamReader(inStream, StandardCharsets.UTF_8), 16000)) {
            String name = null; // null until the first header has been seen
            StringBuilder residues = null;
            String line;
            while ((line = infasta.readLine()) != null) {
                if (line.startsWith(">")) {
                    if (name != null) {
                        seqs.add(new FastaSequence(name, residues.toString()));
                    }
                    name = line.substring(1); // remove >
                    residues = new StringBuilder();
                    continue;
                }
                final String subseq = SequenceUtil.WHITE_SPACE.matcher(line)
                        .replaceAll("");
                if (residues == null) {
                    // Nothing to attach the data to: tolerate blank lines,
                    // reject anything else instead of silently dropping it.
                    if (!subseq.isEmpty()) {
                        throw new IOException(
                                "Sequence data found before the first FASTA header line: "
                                        + line);
                    }
                    continue;
                }
                residues.append(subseq);
            }
            if (name != null) {
                seqs.add(new FastaSequence(name, residues.toString()));
            }
        }
        return seqs;
    }

    /**
     * Writes FastaSequences into the stream, each sequence will take one line
     * only. Characters are encoded with the platform default charset. The
     * stream is flushed and closed when this method returns, also if writing
     * fails.
     *
     * @param os
     *            the stream to write to, closed by this method
     * @param sequences
     *            the sequences to write
     * @throws IOException
     *             if writing to or closing the stream fails
     */
    public static void writeFasta(final OutputStream os,
            final List<FastaSequence> sequences) throws IOException {
        // Closing the BufferedWriter flushes it and closes the wrapped
        // OutputStreamWriter and the underlying stream.
        try (BufferedWriter fastaOut = new BufferedWriter(
                new OutputStreamWriter(os))) {
            for (final FastaSequence fs : sequences) {
                fastaOut.write(fs.getOnelineFasta());
            }
        }
    }
}