// Copyright (C) 2011-2012 CRS4.
//
// This file is part of Seal.
//
// Seal is free software: you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// Seal is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
// or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with Seal. If not, see <http://www.gnu.org/licenses/>.
package it.crs4.seal.read_sort;
import it.crs4.seal.common.FormatException;
import it.crs4.seal.common.UnknownItemException;
import java.io.BufferedReader;
import java.io.Reader;
import java.util.Iterator;
import java.util.Map;
import java.util.HashMap;
import java.util.regex.*;
import org.apache.commons.codec.binary.Hex;
/**
 * Computes one checksum per contig (record) of a FASTA stream.
 *
 * <p>Usage: call {@link #setInput(Reader)}, then {@link #calculate()}, then
 * query the results through {@link #iterator()}, {@link #hasChecksum(String)}
 * or {@link #getChecksum(String)}.  The query methods throw
 * {@link IllegalStateException} until a call to {@code calculate()} has
 * completed successfully for the current input.
 *
 * <p>The checksum of a contig is the digest of the bytes of its sequence lines,
 * taken exactly as they appear in the stream (line terminators are dropped by
 * {@link BufferedReader#readLine()}; no case folding or whitespace stripping is
 * applied), rendered as a hexadecimal string.
 */
public class FastaChecksummer implements Iterable<FastaChecksummer.ChecksumEntry>
{
    /**
     * Immutable pair of a contig name and the hexadecimal string of its checksum.
     */
    public static class ChecksumEntry
    {
        private final String name;
        private final String checksum;

        /**
         * @param name     contig name
         * @param checksum hexadecimal representation of the contig checksum
         */
        public ChecksumEntry(String name, String checksum)
        {
            this.name = name;
            this.checksum = checksum;
        }

        /** @return the contig name */
        public String getName() { return name; }

        /** @return the hexadecimal checksum string */
        public String getChecksum() { return checksum; }
    }

    /** Name of the digest algorithm requested from {@link java.security.MessageDigest}. */
    private static final String checksumAlgorithm = "MD5";

    /**
     * Header line format: '>', optional whitespace, then the contig name, which
     * is the first run of non-whitespace characters.  Anything after the name
     * is ignored.
     */
    private static final Pattern ContigNamePattern = Pattern.compile(">\\s*(\\S+).*");

    private BufferedReader input;

    /** Results of the last successful {@link #calculate()}; null when none are available. */
    private Map<String, ChecksumEntry> contigHashes;

    /**
     * Sets the stream to read the FASTA from and discards any previously
     * calculated checksums.
     *
     * @param stream source of the FASTA text; it is wrapped in a buffered reader
     */
    public void setInput(Reader stream)
    {
        input = new BufferedReader(stream, 4*1024*1024);
        contigHashes = null;
    }

    /**
     * Reads the whole input and computes the checksum of every contig in it.
     *
     * <p>Results become visible to the query methods only if this method
     * returns normally; if it throws, no (partial) results are exposed.
     * If a contig name occurs more than once, the checksum of the last
     * occurrence replaces the earlier one.
     *
     * @throws IllegalStateException if no input has been set
     * @throws FormatException if the input is empty, a header line does not
     *         contain a contig name, or sequence data precedes the first header
     * @throws java.io.IOException if reading the input fails
     */
    public void calculate() throws FormatException, java.io.IOException
    {
        if (input == null)
            throw new IllegalStateException("FastaChecksummer input not set");

        // Invalidate older results right away; the new ones are collected in a
        // local map and published only after the entire input has been parsed.
        contigHashes = null;

        // LinkedHashMap so that iteration follows the order of the contigs in the input.
        Map<String, ChecksumEntry> hashes = new java.util.LinkedHashMap<String, ChecksumEntry>();
        java.security.MessageDigest hasher = newHasher();
        String currentContig = null;

        String line = input.readLine();
        if (line == null)
            throw new FormatException("empty Fasta");

        try
        {
            while (line != null)
            {
                if (line.startsWith(">")) // start a new contig
                {
                    // close the previous contig, if any, before validating the new header
                    if (currentContig != null)
                        storeChecksum(hashes, currentContig, hasher);

                    Matcher m = ContigNamePattern.matcher(line);
                    if (!m.matches())
                        throw new FormatException("Unexpected contig name format: " + line);

                    currentContig = m.group(1);
                    hasher.reset();
                }
                else
                {
                    if (currentContig == null)
                        throw new FormatException("Sequence outside any fasta record (header is missing). Line: " + line);

                    hasher.update( line.getBytes("US-ASCII") );
                }
                line = input.readLine();
            }
        }
        catch (java.io.UnsupportedEncodingException e) {
            throw new RuntimeException("Unexpected UnsupportedEncodingException! Line: " + line, e);
        }

        if (currentContig != null) // store the last contig
            storeChecksum(hashes, currentContig, hasher);

        contigHashes = hashes;
    }

    /**
     * Creates the message digest used for the checksums.
     *
     * @throws RuntimeException wrapping the NoSuchAlgorithmException if the
     *         algorithm is not available
     */
    private static java.security.MessageDigest newHasher()
    {
        try {
            return java.security.MessageDigest.getInstance(checksumAlgorithm);
        }
        catch (java.security.NoSuchAlgorithmException e) {
            throw new RuntimeException("Unexpected NoSuchAlgorithmException when asking for " + checksumAlgorithm + " algorithm", e);
        }
    }

    /**
     * Completes the digest accumulated in {@code hasher} and records it, as a
     * hexadecimal string, under {@code contig}.
     */
    private static void storeChecksum(Map<String, ChecksumEntry> hashes, String contig, java.security.MessageDigest hasher)
    {
        // Hadoop 0.20.2 ships with Apache Commons Codec 1.3, which doesn't
        // have encodeHexString
        String cs = new String(Hex.encodeHex(hasher.digest()));
        hashes.put(contig, new ChecksumEntry(contig, cs));
    }

    /**
     * @return the calculated results
     * @throws IllegalStateException if no results are available
     */
    private Map<String, ChecksumEntry> requireCalculated()
    {
        if (contigHashes == null)
            throw new IllegalStateException("Checksums not calculated");
        return contigHashes;
    }

    /**
     * Iterates over the calculated entries, in the order in which the contigs
     * were first encountered in the input.
     *
     * @throws IllegalStateException if checksums have not been calculated
     */
    public Iterator<ChecksumEntry> iterator()
    {
        return requireCalculated().values().iterator();
    }

    /**
     * @param contigName name of the contig to look up
     * @return true if a checksum was calculated for {@code contigName}
     * @throws IllegalStateException if checksums have not been calculated
     */
    public boolean hasChecksum(String contigName)
    {
        return requireCalculated().containsKey(contigName);
    }

    /**
     * @param contigName name of the contig to look up
     * @return the hexadecimal checksum string of {@code contigName}
     * @throws IllegalStateException if checksums have not been calculated
     * @throws UnknownItemException if no contig with that name was found in the input
     */
    public String getChecksum(String contigName) throws UnknownItemException
    {
        ChecksumEntry entry = requireCalculated().get(contigName);
        if (entry == null)
            throw new UnknownItemException("Unknown contig name " + contigName);
        return entry.getChecksum();
    }
}