package net.sf.cram;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.cram.build.CramIO;
import htsjdk.samtools.cram.structure.CramHeader;
import htsjdk.samtools.util.Log;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.GZIPOutputStream;
import net.sf.cram.CramTools.LevelConverter;
import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import com.beust.jcommander.Parameters;
import com.beust.jcommander.converters.FileConverter;
public class DownloadReferences {
private static Log log = Log.getInstance(DownloadReferences.class);
public static final String COMMAND = "getref";
private static void printUsage(JCommander jc) {
StringBuilder sb = new StringBuilder();
sb.append("\n");
jc.usage(sb);
System.out.println("Version " + DownloadReferences.class.getPackage().getImplementationVersion());
System.out.println(sb.toString());
}
public static void main(String[] args) throws IOException {
Params params = new Params();
JCommander jc = new JCommander(params);
try {
jc.parse(args);
} catch (Exception e) {
System.out.println("Failed to parse parameteres, detailed message below: ");
System.out.println(e.getMessage());
System.out.println();
System.out.println("See usage: -h");
System.exit(1);
}
if (args.length == 0 || params.help) {
printUsage(jc);
System.exit(1);
}
Log.setGlobalLogLevel(params.logLevel);
List<SeqID> idList = new ArrayList<SeqID>();
if (params.md5List != null) {
for (String id : params.md5List)
idList.add(new SeqID(id));
}
if (params.cramFile == null) {
log.error("Expecting a file.");
System.exit(1);
}
if (!params.cramFile.exists()) {
log.error("File does not exist: " + params.cramFile.getAbsolutePath());
System.exit(1);
}
if (!params.cramFile.canRead()) {
log.error("Cannot read file: " + params.cramFile.getAbsolutePath());
System.exit(1);
}
SAMFileHeader samFileHeader = null;
try {
CramHeader cramHeader = CramIO.readCramHeader(new FileInputStream(params.cramFile));
samFileHeader = cramHeader.getSamFileHeader();
} catch (Exception e) {
// try to open using standard way, perhaps this is a valid file
// but not in CRAM format:
log.info("Not a cram file, trying other formats...");
try {
SamReader samReader = SamReaderFactory.make().open(params.cramFile);
samFileHeader = samReader.getFileHeader();
} catch (Exception e1) {
log.error("Failed to open file: unknown file format.");
System.exit(1);
}
}
for (SAMSequenceRecord s : samFileHeader.getSequenceDictionary().getSequences()) {
String md5 = s.getAttribute(SAMSequenceRecord.MD5_TAG);
if (md5 != null)
idList.add(new SeqID(s.getSequenceName(), md5));
}
if (idList.size() < samFileHeader.getSequenceDictionary().size()) {
log.warn(String.format("File header contains %d sequences but only %d have md5 checksums.", samFileHeader
.getSequenceDictionary().size(), idList.size()));
}
OutputStream os = (params.destFile != null ? os = openOptionallyGzippedFile(params.destFile.getAbsolutePath(),
params.gzip) : System.out);
downloadSequences(os, idList, params.ignoreNotFound, params.lineLength);
if (os != System.out)
os.close();
}
private static class SeqID {
private static final int MD5_LEN = 32;
String name;
String md5;
/**
* ID is of form "name:md5".
*
* @param id
*/
public SeqID(String id) {
md5 = id.substring(id.length() - MD5_LEN);
name = id;
if (id.length() > MD5_LEN + 1)
name = id.substring(0, id.length() - MD5_LEN - 1);
if (name == null || name.length() == 0)
name = md5;
}
public SeqID(String name, String md5) {
this.name = name;
this.md5 = md5;
}
}
private static void downloadSequences(OutputStream bos, List<SeqID> idList, boolean ignoreNotFound, int lineLength) {
try {
for (SeqID id : idList) {
log.info(String.format("Locating sequence %s for MD5 %s", id.name, id.md5));
InputStream is = null;
try {
is = getInputStreamForMD5(id.md5);
} catch (IOException ioe) {
String message = ioe.getMessage();
if (message != null & message.startsWith("Server returned HTTP response code: 500")) {
if (ignoreNotFound) {
log.warn(String.format("Not found in the remote repository: sequence '%s' for MD5 %s",
id.name, id.md5));
continue;
} else {
log.error(String.format("Not found in the remote repository: sequence '%s' for MD5 %s",
id.name, id.md5));
throw ioe;
}
} else
throw ioe;
}
printSequence(is, id, bos, lineLength);
}
} catch (IOException e) {
throw new RuntimeException(e);
}
}
private static void printSequence(InputStream is, SeqID id, OutputStream bos, int lineLength) throws IOException {
bos.write('>');
bos.write(id.name.getBytes());
bos.write(' ');
bos.write(id.md5.getBytes());
bos.write('\n');
copy(is, bos, lineLength);
}
private static OutputStream openOptionallyGzippedFile(String name, boolean gzip) throws IOException {
FileOutputStream fos = new FileOutputStream(name + (gzip ? ".gz" : ""));
OutputStream bos = new BufferedOutputStream(fos);
if (gzip)
bos = new GZIPOutputStream(bos);
return bos;
}
private static void copy(InputStream in, OutputStream out, int lineLength) throws IOException {
int count;
byte[] buffer = new byte[8192];
int posInLine = 0, posInBuf = 0;
while ((count = in.read(buffer)) > -1) {
posInBuf = 0;
while (posInBuf < count) {
for (; posInLine < lineLength && posInBuf < count; posInLine++) {
out.write(buffer[posInBuf++]);
}
if (posInLine >= lineLength) {
posInLine = 0;
out.write('\n');
}
}
}
if (posInLine > 0)
out.write('\n');
}
private static InputStream getInputStreamForMD5(String md5) throws IOException {
String urlString = String.format("http://www.ebi.ac.uk/ena/cram/md5/%s", md5);
URL url = new URL(urlString);
return url.openStream();
}
@Parameters(commandDescription = "Download reference sequences.")
static class Params {
@Parameter(names = { "-l", "--log-level" }, description = "Change log level: DEBUG, INFO, WARNING, ERROR.", converter = LevelConverter.class)
Log.LogLevel logLevel = Log.LogLevel.ERROR;
@Parameter(names = { "-h", "--help" }, description = "Print help and quit")
boolean help = false;
@Parameter(names = { "--input-file", "-I" }, converter = FileConverter.class, description = "The path to the CRAM or BAM file to extract sequence MD5 checksums.")
File cramFile;
@Parameter(names = { "--gzip", "-z" }, description = "Compress fasta with gzip.")
boolean gzip;
@Parameter(description = "A list of MD5 checksums for which the sequences should be downloaded.")
List<String> md5List;
@Parameter(names = { "--destination-file", "-F" }, description = "Destination file.")
File destFile;
@Parameter(names = { "--ignore-not-found" }, description = "Don't fail on not found sequences, just issue a warning.")
boolean ignoreNotFound = false;
@Parameter(names = { "--fasta-line-length" }, description = "Wrap fasta lines accroding to this value.")
public int lineLength = 80;
}
}