/* * Eoulsan development code * * This code may be freely distributed and modified under the * terms of the GNU Lesser General Public License version 2.1 or * later and CeCILL-C. This should be distributed with the code. * If you do not have a copy, see: * * http://www.gnu.org/licenses/lgpl-2.1.txt * http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.txt * * Copyright for this code is held jointly by the Genomic platform * of the Institut de Biologie de l'École normale supérieure and * the individual authors. These should be listed in @author doc * comments. * * For more information on the Eoulsan project and its aims, * or to join the Eoulsan Google group, visit the home page * at: * * http://outils.genomique.biologie.ens.fr/eoulsan * */ package fr.ens.biologie.genomique.eoulsan.bio; import static fr.ens.biologie.genomique.eoulsan.util.Utils.newArrayList; import static fr.ens.biologie.genomique.eoulsan.util.Utils.newHashSet; import static java.lang.Math.log10; import static java.lang.Math.max; import static java.lang.Math.pow; import static java.lang.Math.round; import static java.util.Collections.EMPTY_SET; import static java.util.Collections.unmodifiableSet; import java.io.IOException; import java.io.InputStream; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; import java.util.HashSet; import java.util.List; import java.util.Set; import fr.ens.biologie.genomique.eoulsan.bio.io.FastqReader; /** * This enum define the existing fastq formats. It provide many function to * transform a format to another. * @since 1.1 * @author Laurent Jourdren */ public enum FastqFormat { FASTQ_SANGER("fastq-sanger", new String[] {"sanger", "fastq-illumina-1.8", "illumina-1.8", "1.8"}, "1.8", 0, 93, 40, 33, true), FASTQ_SOLEXA("fastq-solexa", new String[] {"solexa", "fastq-solexa-1.0", "solexa-1.0", "1.0"}, "1.0", -5, 62, 40, 64, false), FASTQ_ILLUMINA("fastq-illumina-1.3", new String[] {"fastq-illumina", "illumina", "illumina-1.3", "1.3"}, "1.3", 0, 62, 40, 64, true), FASTQ_ILLUMINA_1_5("fastq-illumina-1.5", new String[] {"illumina-1.5", "1.5"}, "1.5", 2, 62, 40, 64, true); private final String name; private final Set<String> alias; private final String illuminaVersion; private final int scoreMin; private final int scoreMax; private final int scoreMaxExpected; private final int asciiOffset; private final boolean phredScore; // // Getters // /** * Get the name of the fastq format. * @return the name of the format */ public String getName() { return this.name; } /** * Get the alias for the name of this format. * @return a set of string with the alias names */ @SuppressWarnings("unchecked") public Set<String> getAlias() { if (this.alias == null) { return EMPTY_SET; } return unmodifiableSet(this.alias); } /** * Get the first version of the solexa/illumina tools that can have generated * a file with this format. * @return a string with the format version */ public String getIlluminaVersion() { return this.illuminaVersion; } /** * Get the minimal value of the quality score. * @return the minimal value of the quality score */ public int getScoreMin() { return this.scoreMin; } /** * Get the maximal value of the quality score. * @return the maximal value of the quality score */ public int getScoreMax() { return this.scoreMax; } /** * Get the ASCII offset. * @return the ASCII offset */ public int getAsciiOffset() { return this.asciiOffset; } /** * Get the maximal expected value of the quality score. * @return the maximal expected value of the quality score */ public int getScoreMaxExpected() { return this.scoreMaxExpected; } /** * Test if the format use Phred quality score. * @return true if the format Phred quality score */ public boolean isPhredScore() { return this.phredScore; } // // Other methods // /** * Get the minimal ASCII character used to represent the quality score. * @return an ASCII character */ public char getCharMin() { return (char) (this.asciiOffset + this.scoreMin); } /** * Get the maximal ASCII character used to represent the quality score. * @return an ASCII character */ public char getCharMax() { return (char) (this.asciiOffset + this.scoreMax); } /** * Get the maximal ASCII character used to represent the quality score. * @return an ASCII character */ public char getCharMaxExpected() { return (char) (this.asciiOffset + this.scoreMaxExpected); } /** * Test if a character is valid to represent the quality. * @param c the character to test * @return true if the character if valid */ public boolean isCharValid(final char c) { return c >= getCharMin() && c <= getCharMax(); } /** * Test if all the character of a string are valid to represent the quality. * @param s the string to test * @return -1 if all the characters of the string are valid of the value of * the first invalid character */ public int findInvalidChar(final String s) { if (s == null) { throw new NullPointerException(); } final int len = s.length(); for (int i = 0; i < len; i++) { final char c = s.charAt(i); if (!isCharValid(c)) { return c; } } return -1; } /** * Convert a character to a quality number. * @param character character to convert * @return a quality score */ public int getScore(final char character) { return character - this.asciiOffset; } /** * Convert a character to an error probability. * @param character character to convert * @return a quality score */ public double getProbability(final char character) { return convertScoreToProbability(character - this.asciiOffset); } /** * Convert a quality score to a probability. * @param score the quality score * @return the probability that correspond to the quality score */ public double convertScoreToProbability(final int score) { if (this.phredScore) { return pow(10.0, (double) score / -10.0); } return 1.0 / ((1.0 / pow(10.0, -score / 10.0)) + 1.0); } /** * Convert a probability to a quality score. * @param p probability to convert * @return a quality score */ public double convertProbabilityToScore(final double p) { if (this.phredScore) { return -10.0 * log10(p); } return -10.0 * log10(p / (1 - p)); } /** * Convert Solexa quality score to Phred quality score. The formula is taken * from BioPython. * @param solexaScore the input quality score * @return the quality converted to Phred quality score */ public static double convertSolexaScoreToPhredScore(final int solexaScore) { if (solexaScore < -5) { throw new IllegalArgumentException( "Invalid Solexa quality: " + solexaScore); } return 10.0 * log10(pow(10, solexaScore / 10.0) + 1); } /** * Convert Phred quality score to Solexa quality score. The formula is taken * from BioPython. * @param phredScore the input quality score * @return the quality converted to Solexa */ public static double convertPhredSCoreToSolexaScore(final int phredScore) { if (phredScore == 0) { return -5.0; } if (phredScore > 0) { return max(-5.0, 10.0 * log10(pow(10, phredScore / 10.0) - 1)); } throw new IllegalArgumentException("Invalid PHRED quality: " + phredScore); } /** * Convert a quality score string from a format to another. * @param quality quality string to convert * @param format output format * @return the converted quality string */ public String convertTo(final String quality, final FastqFormat format) { if (quality == null) { return null; } final StringBuilder sb = new StringBuilder(quality.length()); for (char c : quality.toCharArray()) { sb.append(convertTo(c, format)); } return sb.toString(); } /** * Convert a quality score character from a format to another. * @param character character to convert * @param format output format * @return the converted character */ public char convertTo(final char character, final FastqFormat format) { return (char) (format.asciiOffset + convertScoreTo(getScore(character), format)); } /** * Convert quality from a format to another. * @param score quality score to transform * @param format output format * @return a converted quality score */ public int convertScoreTo(final int score, final FastqFormat format) { if (this.isPhredScore() != format.isPhredScore()) { if (isPhredScore()) { return (int) round(convertPhredSCoreToSolexaScore(score)); } return (int) round(convertSolexaScoreToPhredScore(score)); } return score; } /** * Get a format from its name or its alias. * @param name name of the format to get * @return the format or null if no format was found */ public static FastqFormat getFormatFromName(final String name) { if (name == null) { return null; } final String lowerName = name.toLowerCase().trim(); for (FastqFormat format : FastqFormat.values()) { if (format.getName().toLowerCase().equals(lowerName)) { return format; } if (format.alias != null && format.alias.contains(lowerName)) { return format; } } return null; } /** * Identify the fastq format used in a Fastq file. * @param is input stream * @return The FastqFormat found or null if no format was found * @throws IOException if an error occurs while reading the fastq stream * @throws BadBioEntryException if bad fastq entry is found */ public static FastqFormat identifyFormat(final InputStream is) throws IOException, BadBioEntryException { return identifyFormat(is, -1); } /** * Identify the fastq format used in a Fastq file. * @param is input stream * @param maxEntriesToRead maximal entries of the file to read. If this value * is lower than 1 all the entries of the stream are read * @return The FastqFormat found or null if no format was found * @throws IOException if an error occurs while reading the fastq stream * @throws BadBioEntryException if bad fastq entry is found */ public static FastqFormat identifyFormat(final InputStream is, final int maxEntriesToRead) throws IOException, BadBioEntryException { if (is == null) { throw new NullPointerException("The input format is null"); } final FastqReader reader = new FastqReader(is); final Set<FastqFormat> formats = newHashSet(Arrays.asList(FastqFormat.values())); int count = 0; final int[] range = new int[] {Integer.MAX_VALUE, Integer.MIN_VALUE}; for (final ReadSequence read : reader) { if (maxEntriesToRead > 0 && count > maxEntriesToRead) { break; } removeBadFormats(formats, read.getQuality(), range); count++; } reader.throwException(); reader.close(); return identifyFormatByHeuristic(formats, range[0], range[1]); } /** * Identify the fastq format used in a quality string file. * @param qualityString a string with quality data * @return The FastqFormat found or null if no format was found */ public static FastqFormat identifyFormat(final String qualityString) { if (qualityString == null) { return null; } final Set<FastqFormat> formats = newHashSet(Arrays.asList(FastqFormat.values())); final int[] range = new int[] {Integer.MAX_VALUE, Integer.MIN_VALUE}; removeBadFormats(formats, qualityString, range); return identifyFormatByHeuristic(formats, range[0], range[1]); } private static void removeBadFormats(final Set<FastqFormat> formats, final String qualityString, final int[] range) { Set<FastqFormat> toRemove = null; for (FastqFormat format : formats) { for (int i = 0; i < qualityString.length(); i++) { final int c = qualityString.codePointAt(i); // Check if the character is the lowest value if (c < range[0]) { range[0] = c; } // Check if the character is highest value if (c > range[1]) { range[1] = c; } if (c < format.getCharMin() || c > format.getCharMax()) { if (toRemove == null) { toRemove = new HashSet<>(); } toRemove.add(format); } } } if (toRemove != null) { formats.removeAll(toRemove); } } private static FastqFormat identifyFormatByHeuristic( final Set<FastqFormat> formats, final int lowerChar, final int higherChar) { if (formats == null) { return null; } if (formats.isEmpty()) { return null; } // Sort formats with increasing minimal char final List<FastqFormat> sortedFormats = newArrayList(formats); Collections.sort(sortedFormats, new Comparator<FastqFormat>() { @Override public int compare(final FastqFormat o1, final FastqFormat o2) { return Integer.valueOf(o1.getCharMin()) .compareTo((int) o2.getCharMin()); } }); FastqFormat last = null; // The format is the for (FastqFormat f : sortedFormats) { if (last != null && last.getCharMin() <= lowerChar && lowerChar < f.getCharMin()) { return f; } last = f; } // Check if the higher value is valid for the selected format if (higherChar <= last.getCharMax()) { return last; } return null; } @Override public String toString() { return getName(); } // // Constructor // /** * Constructor. * @param name format name * @param alias for the format name * @param illuminaVersion illumina version of the format * @param scoreMin quality score minimal value * @param scoreMax quality score maximal value * @param scoreMaxExpected quality score maximal expected value * @param asciiOffset ASCII offset * @param phredQualityScore Phred quality score */ FastqFormat(final String name, final String[] alias, final String illuminaVersion, final int scoreMin, final int scoreMax, final int scoreMaxExpected, final int asciiOffset, final boolean phredQualityScore) { this.name = name; this.alias = alias == null ? null : newHashSet(Arrays.asList(alias)); this.illuminaVersion = illuminaVersion; this.scoreMin = scoreMin; this.scoreMax = scoreMax; this.scoreMaxExpected = scoreMaxExpected; this.asciiOffset = asciiOffset; this.phredScore = phredQualityScore; } }