package net.sf.cram.fasta;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Scanner;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.cram.io.InputStreamUtils;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.samtools.seekablestream.SeekableFileStream;
import htsjdk.samtools.util.BlockCompressedInputStream;
import htsjdk.samtools.util.Log;
import net.sf.cram.AlignmentSliceQuery;
import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import com.beust.jcommander.Parameters;
import com.beust.jcommander.converters.FileConverter;
public class BGZF_ReferenceSequenceFile implements ReferenceSequenceFile {
private static Log log = Log.getInstance(BGZF_ReferenceSequenceFile.class);
private LinkedHashMap<String, FAIDX_FastaIndexEntry> index = new LinkedHashMap<String, FAIDX_FastaIndexEntry>();
private Iterator<String> iterator;
private BlockCompressedInputStream is;
private SAMSequenceDictionary dictionary;
public BGZF_ReferenceSequenceFile(File file) throws FileNotFoundException {
if (!file.canRead())
throw new RuntimeException("Cannot find or read fasta file: " + file.getAbsolutePath());
File indexFile = new File(file.getAbsolutePath() + ".fai");
if (!indexFile.canRead())
throw new RuntimeException("Cannot find or read fasta index file: " + indexFile.getAbsolutePath());
Scanner scanner = new Scanner(indexFile);
int seqID = 0;
dictionary = new SAMSequenceDictionary();
while (scanner.hasNextLine()) {
String line = scanner.nextLine();
FAIDX_FastaIndexEntry entry = FAIDX_FastaIndexEntry.fromString(seqID++, line);
index.put(entry.getName(), entry);
dictionary.addSequence(new SAMSequenceRecord(entry.getName(), entry.getLen()));
}
scanner.close();
if (index.isEmpty())
log.warn("No entries in the index: " + indexFile.getAbsolutePath());
is = new BlockCompressedInputStream(new SeekableFileStream(file));
}
public List<String> lookUpByRegex(String regex) {
List<String> list = new ArrayList<String>();
for (String name : index.keySet())
if (name.matches(regex))
list.add(name);
return list;
}
@Override
public SAMSequenceDictionary getSequenceDictionary() {
return dictionary;
}
@Override
public ReferenceSequence nextSequence() {
if (iterator == null)
iterator = index.keySet().iterator();
if (!iterator.hasNext())
return null;
String name = iterator.next();
try {
return findSequence(name, 0, 0);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
@Override
public void reset() {
iterator = null;
}
@Override
public boolean isIndexed() {
return true;
}
@Override
public ReferenceSequence getSequence(String contig) {
try {
return findSequence(contig, 0, 0);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
@Override
public ReferenceSequence getSubsequenceAt(String contig, long start, long stop) {
try {
return findSequence(contig, start, stop);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
@Override
public void close() throws IOException {
}
private ReferenceSequence findSequence(String name, long start, long stop) throws IOException {
if (!index.containsKey(name))
return null;
FAIDX_FastaIndexEntry entry = index.get(name);
if (start < 1)
start = 1;
if (stop < 1)
stop = start + entry.getLen() - 1;
if (stop < start)
throw new RuntimeException("Invalid sequence boundaries.");
int len = (int) (stop - start) + 1;
len = Math.min(entry.getLen(), len);
is.seek(entry.getStartPointer());
int lineBreakLen = entry.getBytesPerLine() - entry.getBasesPerLine();
{ // calculate how many bytes to skip from the beginning of the sequence
// in the file:
long skip = start + (start / entry.getBasesPerLine()) * lineBreakLen;
// System.out.println("skip=" + skip);
is.skip(skip - 1);
}
byte[] data = new byte[len];
int bufPos = 0;
byte b;
while ((b = (byte) is.read()) != -1 && bufPos < len && b != '\r' && b != '\n')
data[bufPos++] = b;
// skip the rest of "new line" bytes:
for (int i = 1; i < entry.getBytesPerLine() - entry.getBasesPerLine(); i++)
is.read();
// read complete lines:
int completeLinesToRead = (len - bufPos) / entry.getBasesPerLine();
for (int line = 0; line < completeLinesToRead; line++) {
InputStreamUtils.readFully(is, data, bufPos, entry.getBasesPerLine());
bufPos += entry.getBasesPerLine();
is.skip(lineBreakLen);
}
// read the rest of the last incomplete line:
while ((b = (byte) is.read()) != -1 && bufPos < len && b != '\r' && b != '\n')
data[bufPos++] = b;
return new ReferenceSequence(entry.getName(), entry.getIndex(), data);
}
public static void main(String[] args) throws FileNotFoundException {
Params params = new Params();
JCommander jc = new JCommander(params);
jc.setProgramName("bquery");
try {
jc.parse(args);
} catch (Exception e) {
jc.usage();
return;
}
if (params.file == null) {
jc.usage();
return;
}
BGZF_ReferenceSequenceFile rsf = new BGZF_ReferenceSequenceFile(params.file);
for (String stringQuery : params.queries) {
AlignmentSliceQuery q = new AlignmentSliceQuery(stringQuery);
if (!params.strictSeqNameMatch) {
List<String> list = lookup(q.sequence, rsf);
if (list.isEmpty()) {
log.warn("Sequence not found: " + q.sequence);
continue;
}
if (params.listMatches) {
for (String seq : list)
System.out.println(seq);
continue;
}
if (list.size() > 1)
q.sequence = chooseOne(q.sequence, list);
}
ReferenceSequence seq = rsf.getSubsequenceAt(q.sequence, q.start, q.end);
if (seq == null)
System.err.println("Nothing found for: " + stringQuery);
else
System.out.println(new String(seq.getBases()));
}
}
private static List<String> lookup(String name, BGZF_ReferenceSequenceFile rsf) {
return rsf.lookUpByRegex("\\b" + name + ".*\\b");
}
private static String chooseOne(String query, List<String> list) {
// grab the one that starts with the query or the first from
// the list:
String best = null;
boolean foundCandidateWhichStartsWithQuery = false;
for (String candidate : list) {
if (candidate.startsWith(query)) {
best = candidate;
foundCandidateWhichStartsWithQuery = true;
break;
}
}
if (!foundCandidateWhichStartsWithQuery)
best = list.get(0);
log.warn(String.format("Assuming '%s' means '%s'", query, best));
return best;
}
@Parameters(commandDescription = "BGZF fasta utility")
static class Params {
@Parameter(names = { "-I" }, description = "Block compressed fasta file.", converter = FileConverter.class)
File file;
@Parameter(description = "Regions to fetch, each region should follow the rule: <seq name>[:<start>][-<stop>]]")
List<String> queries;
@Parameter(names = { "--strict" }, description = "Match sequence names exactly as in the queries. ")
boolean strictSeqNameMatch = false;
@Parameter(names = { "--list-matches" }, description = "Print out a list of matching names.")
boolean listMatches = false;
}
}