// Utils.java example
//
// Explorer
// biojava-master
/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.aaproperties;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.HashSet;
import java.util.Locale;
import java.util.Set;

/**
 * Utility methods shared by the amino acid property code: rounding of numeric results and
 * validation / cleaning of protein sequences against a set of allowed characters.
 * <p>
 * Whenever a method accepts a {@code cSet} of valid characters and {@code null} is passed,
 * {@link PeptideProperties#standardAASet} is used instead.
 *
 * @author kohchuanhock
 * @version 2011.08.22
 * @since 3.0.2
 */
public class Utils {

	private static final Logger logger = LoggerFactory.getLogger(Utils.class);

	/**
	 * Returns a value rounded to the desired number of decimal places.
	 * <p>
	 * Rounding follows {@link Math#round(double)} (ties are rounded towards positive infinity).
	 * Inputs that cannot be meaningfully rounded are returned unchanged: {@code NaN}, infinities,
	 * and values whose scaled magnitude ({@code d * 10^c}) does not fit in a {@code long}.
	 *
	 * @param d
	 * 		value to round
	 * @param c
	 * 		number of decimal places desired.
	 * 		Must be greater or equal to zero, otherwise, the given value d would be returned without any modification.
	 * @return
	 * 		a value with the given number of decimal places.
	 */
	public static final double roundToDecimals(double d, int c) {
		// Math.round maps NaN to 0 and infinities to Long.MIN/MAX_VALUE, so bail out before using it.
		if(c < 0 || Double.isNaN(d) || Double.isInfinite(d)) return d;
		double factor = Math.pow(10, c);
		double scaled = d * factor;
		// Outside the long range Math.round saturates and would yield garbage. A double that large
		// (after scaling) has no resolution finer than 10^-c anyway, so d is already "rounded".
		if(Double.isInfinite(scaled) || Math.abs(scaled) >= Long.MAX_VALUE) return d;
		return Math.round(scaled) / factor;
	}

	/**
	 * Checks if given sequence contains invalid characters. Returns true if invalid characters are found, else return false.
	 * Note that a character is deemed valid only if it is found in cSet. The comparison is case-sensitive.
	 *
	 * @param sequence
	 * 		protein sequence to be checked. Must not be null.
	 * @param cSet
	 * 		the set of characters that are deemed valid. Can be null, in which case
	 * 		{@link PeptideProperties#standardAASet} is used.
	 * @return
	 * 		true if invalid characters are found, else return false.
	 * @throws NullPointerException
	 * 		if sequence is null.
	 */
	public static final boolean doesSequenceContainInvalidChar(String sequence, Set<Character> cSet){
		if(cSet == null) cSet = PeptideProperties.standardAASet;
		for(char c : sequence.toCharArray()){
			if(!cSet.contains(c)) return true;
		}
		return false;
	}

	/**
	 * Return the number of invalid characters in sequence. Every occurrence is counted,
	 * so a repeated invalid character contributes once per position.
	 *
	 * @param sequence
	 * 		protein sequence to count for invalid characters. Must not be null.
	 * @param cSet
	 * 		the set of characters that are deemed valid. Can be null, in which case
	 * 		{@link PeptideProperties#standardAASet} is used.
	 * @param ignoreCase
	 * 		indicates if cases should be ignored. When true the sequence is converted to upper case
	 * 		before the lookup, so cSet is expected to hold upper-case characters.
	 * @return
	 * 		the number of invalid characters in sequence.
	 * @throws NullPointerException
	 * 		if sequence is null.
	 */
	public static final int getNumberOfInvalidChar(String sequence, Set<Character> cSet, boolean ignoreCase){
		if(cSet == null) cSet = PeptideProperties.standardAASet;
		// Locale.ROOT keeps the conversion independent of the JVM default locale
		// (e.g. under a Turkish locale 'i' would otherwise become U+0130 instead of 'I').
		String toScan = ignoreCase ? sequence.toUpperCase(Locale.ROOT) : sequence;
		int total = 0;
		for(char c : toScan.toCharArray()){
			if(!cSet.contains(c)) total++;
		}
		return total;
	}

	/**
	 * Returns a new sequence with all invalid characters being replaced by '-'.
	 * A single warning listing the distinct replaced characters is logged when at least one
	 * character was replaced; nothing is logged for a sequence that is already clean.
	 *
	 * @param sequence
	 * 		protein sequence to be cleaned. Must not be null.
	 * @param cSet
	 * 		user defined characters that are valid. Can be null. If null, then
	 * 		{@link PeptideProperties#standardAASet} will be considered as valid.
	 * @return
	 * 		a new sequence with all invalid characters being replaced by '-'.
	 * @throws NullPointerException
	 * 		if sequence is null.
	 */
	public static final String cleanSequence(String sequence, Set<Character> cSet){
		if(cSet == null) cSet = PeptideProperties.standardAASet;
		Set<Character> invalidCharSet = new HashSet<Character>();
		StringBuilder cleanSeq = new StringBuilder(sequence.length());
		for(char c : sequence.toCharArray()){
			if(cSet.contains(c)){
				cleanSeq.append(c);
			}else{
				cleanSeq.append('-');
				invalidCharSet.add(c);
			}
		}

		// Only build and emit the warning when something was actually replaced; an empty set
		// previously caused a StringIndexOutOfBoundsException while trimming the message.
		if(!invalidCharSet.isEmpty()){
			// TODO: Should be StringJoiner once JDK8 used
			StringBuilder replaced = new StringBuilder();
			for(char c : invalidCharSet){
				if(replaced.length() > 0) replaced.append(", ");
				replaced.append('\'').append(c).append('\'');
			}
			logger.warn("{} are being replaced with '-'", replaced);
		}

		return cleanSeq.toString();
	}

	/**
	 * Checks if the sequence contains invalid characters, using
	 * {@link PeptideProperties#standardAASet} as the set of valid characters.
	 * If yes, it will return a new sequence where invalid characters are replaced with '-'.
	 * If no, it will simply return the input sequence.
	 *
	 * @param sequence
	 * 		protein sequence to be checked for invalid characters. A null sequence is returned as is.
	 * @return
	 * 		a sequence with no invalid characters.
	 */
	public static final String checkSequence(String sequence){
		return checkSequence(sequence, null);
	}

	/**
	 * Checks if the sequence contains invalid characters.
	 * If yes, it will return a new sequence where invalid characters are replaced with '-'
	 * and log warnings about the replacement.
	 * If no, it will simply return the input sequence.
	 *
	 * @param sequence
	 * 		protein sequence to be checked for invalid characters. A null sequence is returned as is.
	 * @param cSet
	 * 		character set which defines the valid characters. Can be null, in which case
	 * 		{@link PeptideProperties#standardAASet} is used.
	 * @return
	 * 		a sequence with no invalid characters.
	 */
	public static final String checkSequence(String sequence, Set<Character> cSet){
		// null has nothing to validate; a clean sequence is handed back untouched (same instance).
		if(sequence == null || !doesSequenceContainInvalidChar(sequence, cSet)){
			return sequence;
		}
		String cSeq = cleanSequence(sequence, cSet);
		logger.warn("There exists invalid characters in the sequence. Computed results might not be precise.");
		logger.warn("To remove this warning: Please use org.biojava.nbio.aaproperties.Utils.cleanSequence to clean sequence.");
		return cSeq;
	}
}