/*
* Eoulsan development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public License version 2.1 or
* later and CeCILL-C. This should be distributed with the code.
* If you do not have a copy, see:
*
* http://www.gnu.org/licenses/lgpl-2.1.txt
* http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.txt
*
* Copyright for this code is held jointly by the Genomic platform
* of the Institut de Biologie de l'École normale supérieure and
* the individual authors. These should be listed in @author doc
* comments.
*
* For more information on the Eoulsan project and its aims,
* or to join the Eoulsan Google group, visit the home page
* at:
*
* http://outils.genomique.biologie.ens.fr/eoulsan
*
*/
package fr.ens.biologie.genomique.eoulsan.bio;
import static fr.ens.biologie.genomique.eoulsan.EoulsanLogger.getLogger;
import static fr.ens.biologie.genomique.eoulsan.util.Utils.checkNotNull;
import static fr.ens.biologie.genomique.eoulsan.util.Utils.newArrayList;
import static java.util.Arrays.asList;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Writer;
import java.math.BigInteger;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import fr.ens.biologie.genomique.eoulsan.Globals;
import fr.ens.biologie.genomique.eoulsan.bio.io.FastaLineParser;
import fr.ens.biologie.genomique.eoulsan.util.FileUtils;
import fr.ens.biologie.genomique.eoulsan.util.StringUtils;
/**
* This class define a genome description.
* @since 1.0
* @author Laurent Jourdren
*/
public class GenomeDescription {
private static final String PREFIX = "genome.";
private static final String NAME_PREFIX = PREFIX + "name";
private static final String LENGTH_PREFIX = PREFIX + "length";
private static final String MD5_PREFIX = PREFIX + "md5";
private static final String SEQUENCE_PREFIX = PREFIX + "sequence.";
private static final String SEQUENCES_COUNT_PREFIX = PREFIX + "sequences";
private String genomeName;
private final Map<String, Long> sequences = new LinkedHashMap<>();
private String md5Sum;
//
// Setters
//
/**
* Set the genome name.
* @param genomeName name of the genome
*/
private void setGenomeName(final String genomeName) {
this.genomeName = genomeName;
}
/**
* Add a sequence.
* @param sequenceName name of the sequence
* @param sequenceLength length of the sequence
*/
public void addSequence(final String sequenceName,
final long sequenceLength) {
getLogger().fine("Add sequence in genome description: "
+ sequenceName + " with " + sequenceLength + " pb");
this.sequences.put(sequenceName, sequenceLength);
}
/**
* Set the md5 digest of the genome file
* @param md5Digest the md5 digest
*/
public void setMD5Sum(final String md5Digest) {
this.md5Sum = md5Digest;
}
//
// Getters
//
/**
* Get the genome name.
* @return the genome name
*/
public String getGenomeName() {
return this.genomeName;
}
/**
* Get the length of a sequence
* @param sequenceName name of the sequence
* @return the length of the sequence or -1 if the sequence does not exists
*/
public long getSequenceLength(final String sequenceName) {
if (this.sequences.containsKey(sequenceName)) {
return this.sequences.get(sequenceName);
}
return -1;
}
/**
* Test if the genome description contains a sequence
* @param sequenceName name of the sequence
* @return true if the sequence exists in the genome description
*/
public boolean containsSequence(final String sequenceName) {
return this.sequences.containsKey(sequenceName);
}
/**
* Get the names of the sequences.
* @return a set with the name of the sequence
*/
public List<String> getSequencesNames() {
return Collections.unmodifiableList(newArrayList(this.sequences.keySet()));
}
/**
* Get the md5 sum for the genome.
* @return the md5 sum
*/
public String getMD5Sum() {
return this.md5Sum;
}
/**
* Get the number of sequences in the genome.
* @return the number of sequences in the genome
*/
public int getSequenceCount() {
return this.sequences.size();
}
/**
* Get the genome length;
* @return the genome length
*/
public long getGenomeLength() {
long count = 0;
for (Map.Entry<String, Long> e : this.sequences.entrySet()) {
count += e.getValue();
}
return count;
}
//
// Save description
//
/**
* Save genome description.
* @param os OutputStream to use for genome description writing
*/
public void save(final OutputStream os) throws IOException {
checkNotNull(os, "OutputStream is null");
final Writer writer = FileUtils.createFastBufferedWriter(os);
if (this.genomeName != null) {
writer.write(NAME_PREFIX + "=" + getGenomeName() + '\n');
}
if (this.md5Sum != null) {
writer.write(MD5_PREFIX + "=" + getMD5Sum() + '\n');
}
writer.write(SEQUENCES_COUNT_PREFIX + '=' + getSequenceCount() + '\n');
writer.write(LENGTH_PREFIX + '=' + getGenomeLength() + '\n');
for (String seqName : getSequencesNames()) {
writer.write(
SEQUENCE_PREFIX + seqName + "=" + getSequenceLength(seqName) + "\n");
}
writer.close();
}
/**
* Save genome description.
* @param file output file
*/
public void save(final File file) throws FileNotFoundException, IOException {
checkNotNull(file, "File is null");
save(FileUtils.createOutputStream(file));
}
//
// Load description
//
/**
* Load genome description.
* @param is InputStream to use
*/
public static GenomeDescription load(final InputStream is)
throws IOException {
checkNotNull(is, "InputStream is null");
final GenomeDescription result = new GenomeDescription();
final BufferedReader read = FileUtils.createBufferedReader(is);
String line = null;
while ((line = read.readLine()) != null) {
final List<String> fields = asList(line.split("="));
if (fields.size() > 1) {
final String key = fields.get(0).trim();
if (key.startsWith(NAME_PREFIX)) {
result.setGenomeName(fields.get(1));
}
if (key.startsWith(MD5_PREFIX)) {
result.setMD5Sum(fields.get(1));
} else {
try {
if (key.startsWith(SEQUENCE_PREFIX)) {
result.addSequence(key.substring(SEQUENCE_PREFIX.length()),
Integer.parseInt(fields.get(1)));
}
} catch (NumberFormatException e) {
}
}
}
}
is.close();
return result;
}
/**
* Load genome description.
* @param file File to use
*/
public static GenomeDescription load(final File file) throws IOException {
checkNotNull(file, "File is null");
return load(FileUtils.createInputStream(file));
}
//
// Static methods
//
/**
* Create a GenomeDescription object from a Fasta file.
* @param genomeFastaFile genome fasta file
*/
public static GenomeDescription createGenomeDescFromFasta(
final File genomeFastaFile) throws BadBioEntryException, IOException {
checkNotNull(genomeFastaFile, "The genome file is null");
return createGenomeDescFromFasta(
FileUtils.createInputStream(genomeFastaFile),
genomeFastaFile.getName());
}
/**
* Create a GenomeDescription object from a Fasta file.
* @param genomeFastaIs genome fasta input stream
* @param filename name of the file of the input stream
*/
public static GenomeDescription createGenomeDescFromFasta(
final InputStream genomeFastaIs, final String filename)
throws BadBioEntryException, IOException {
return createGenomeDesc(genomeFastaIs, filename, false);
}
/**
* Create a GenomeDescription object from a GFF file.
* @param gffFile genome in GFF file
*/
public static GenomeDescription createGenomeDescFromGFF(final File gffFile)
throws BadBioEntryException, IOException {
checkNotNull(gffFile, "The genome file is null");
return createGenomeDescFromGFF(FileUtils.createInputStream(gffFile),
gffFile.getName());
}
/**
* Create a GenomeDescription object from a GFF file.
* @param gffFile genome in GFF input stream
* @param filename name of the file of the input stream
*/
public static GenomeDescription createGenomeDescFromGFF(
final InputStream gffFile, final String filename)
throws BadBioEntryException, IOException {
return createGenomeDesc(gffFile, filename, true);
}
/**
* Create a GenomeDescription object from a Fasta file of GFF file.
* @param genomeFastaIs genome fasta input stream
* @param filename name of the file of the input stream
* @param gffFormat the input file is in GFF format
*/
public static GenomeDescription createGenomeDesc(
final InputStream genomeFastaIs, final String filename,
final boolean gffFormat) throws BadBioEntryException, IOException {
checkNotNull(genomeFastaIs, "The input stream of the genome is null");
getLogger().fine("Compute genome description from genome fasta file.");
final GenomeDescription result = new GenomeDescription();
result.setGenomeName(StringUtils.basename(filename));
MessageDigest md5Digest;
try {
md5Digest = MessageDigest.getInstance("MD5");
} catch (NoSuchAlgorithmException e) {
md5Digest = null;
}
final FastaLineParser parser =
new FastaLineParser(genomeFastaIs, gffFormat);
final Alphabet alphabet = Alphabets.AMBIGUOUS_DNA_ALPHABET;
String seqName = null;
String lastSeqName = null;
String parsedSeqName = null;
long chrSize = 0;
while ((seqName = parser.parseNextLineAndGetSequenceName()) != null) {
if (!seqName.equals(lastSeqName)) {
// Check if sequence has been found more than one time
if (result.getSequenceLength(lastSeqName) != -1) {
throw new BadBioEntryException(
"Sequence name found twice: " + lastSeqName, lastSeqName);
}
// Add sequence
if (lastSeqName != null) {
result.addSequence(parsedSeqName, chrSize);
}
// Parse chromosome name
parsedSeqName = parseChromosomeName(seqName);
if (parsedSeqName == null) {
throw new IOException("No fasta header found.");
}
// Update digest with chromosome name
if (md5Digest != null) {
md5Digest.update(parsedSeqName.getBytes(Globals.DEFAULT_CHARSET));
}
lastSeqName = seqName;
chrSize = 0;
}
final String sequence = parser.getSequence();
if (sequence == null) {
throw new IOException("No fasta sequence found.");
}
// Check the sequence and increment the length of the sequence
chrSize += checkBases(sequence, lastSeqName, alphabet);
// Update digest with chromosome sequence
md5Digest.update(sequence.getBytes(Globals.DEFAULT_CHARSET));
}
// Add the last sequence
if (lastSeqName != null) {
result.addSequence(parsedSeqName, chrSize);
}
// Compute final MD5 sum
if (md5Digest != null) {
result.setMD5Sum(digestToString(md5Digest));
}
genomeFastaIs.close();
return result;
}
private static String parseChromosomeName(final String fastaHeader)
throws BadBioEntryException {
if (fastaHeader == null) {
return null;
}
if ("".equals(fastaHeader.trim())) {
throw new BadBioEntryException("Sequence header is empty",
">" + fastaHeader);
}
if (fastaHeader.startsWith(" ")) {
throw new BadBioEntryException(
"A whitespace was found at the beginning of the sequence name",
">" + fastaHeader);
}
final String s = fastaHeader.trim();
String[] fields = s.split("\\s");
if (fields == null || fields.length == 0) {
throw new BadBioEntryException("Invalid sequence header",
">" + fastaHeader);
}
return fields[0];
}
private static long checkBases(final String sequence,
final String sequenceName, final Alphabet alphabet)
throws BadBioEntryException {
final char[] array = sequence.toCharArray();
for (final char c : array) {
if (!alphabet.isLetterValid(c)) {
throw new BadBioEntryException("Invalid base in genome: " + c,
sequenceName);
}
}
return sequence.length();
}
private static String digestToString(final MessageDigest md) {
if (md == null) {
return null;
}
final BigInteger bigInt = new BigInteger(1, md.digest());
return bigInt.toString(16);
}
//
// Other methods
//
@Override
public String toString() {
return this.getClass().getSimpleName()
+ "{genomeName=" + this.genomeName + ", sequencesCount="
+ this.sequences.size() + ", md5Sum=" + this.md5Sum + ", sequences="
+ this.sequences + "}";
}
}