package net.sf.cram.fasta;
import java.io.EOFException;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.ByteBuffer;
import java.util.List;
import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import com.beust.jcommander.Parameters;
import htsjdk.samtools.seekablestream.SeekableFileStream;
import htsjdk.samtools.util.BlockCompressedInputStream;
import htsjdk.samtools.util.Log;
class BGZF_FastaIndexer {
private static Log log = Log.getInstance(BGZF_FastaIndexer.class);
private BlockCompressedInputStream is;
private long start;
private int len;
private int lineWidthNoNL, lineWidthWithNL;
private ByteBuffer lineBuf = ByteBuffer.allocate(1024);
private long lineCounter = 0;
private int sequenceCounter = 0;
private boolean hasNextNameInBuf = false;
public BGZF_FastaIndexer(BlockCompressedInputStream is) {
this.is = is;
}
private boolean readLine() throws IOException {
lineCounter++;
int ch = is.read();
if (ch == -1)
return false;
lineBuf.clear();
lineBuf.put((byte) (0xFF & ch));
while ((ch = is.read()) != -1) {
lineBuf.put((byte) (0xFF & ch));
if (!lineBuf.hasRemaining())
reallocate();
if (ch == '\n')
break;
}
if (ch == -1)
throw new EOFException();
lineBuf.flip();
return true;
}
private void reallocate() {
int newSize = Math.min(2 * lineBuf.capacity(), Integer.MAX_VALUE);
if (newSize <= lineBuf.capacity())
throw new RuntimeException("Can't handle lines longer than 2gb.");
log.info("Reallocating line buffer to new size: " + newSize);
int pos = lineBuf.position();
byte[] newArray = new byte[newSize];
System.arraycopy(lineBuf.array(), 0, newArray, 0, lineBuf.limit());
lineBuf = ByteBuffer.wrap(newArray);
lineBuf.position(pos);
lineBuf.limit(newSize);
}
private int trimmedLength() {
int len = lineBuf.limit();
for (int i = lineBuf.limit() - 1; i >= 0; i--) {
switch (lineBuf.get(i)) {
case '\r':
case '\n':
len--;
break;
default:
break;
}
}
return len;
}
private void readSeq() throws IOException {
len = 0;
lineWidthNoNL = 0;
lineWidthWithNL = 0;
start = is.getFilePointer();
hasNextNameInBuf = false;
while (readLine()) {
if (lineBuf.get(0) == '>') {
hasNextNameInBuf = true;
break;
}
lineWidthWithNL = Math.max(lineWidthWithNL, lineBuf.limit());
int trimmedLength = trimmedLength();
lineWidthNoNL = Math.max(lineWidthNoNL, trimmedLength);
len += trimmedLength;
}
if (len == 0)
throw new RuntimeException("Invalid format: no sequence line.");
}
public FAIDX_FastaIndexEntry readNext() throws IOException {
if (!hasNextNameInBuf)
if (!readLine())
return null;
if (lineBuf.limit() == 0)
throw new RuntimeException("Invalid format: empty line.");
if (lineBuf.get(0) != '>') {
throw new RuntimeException("Invalid format: sequence name expected to start with '>' at line "
+ lineCounter);
}
byte[] nameBytes = new byte[trimmedLength() - 1];
lineBuf.limit(nameBytes.length + 1);
lineBuf.get();
lineBuf.get(nameBytes);
readSeq();
String name = new String(nameBytes).split(" ")[0];
return new FAIDX_FastaIndexEntry(sequenceCounter++, name, len, start, lineWidthNoNL, lineWidthWithNL);
}
public static void main(String[] args) throws IOException {
Params params = new Params();
JCommander jc = new JCommander(params);
jc.parse(args);
for (File file : params.files) {
log.info("Indexing file: " + file.getAbsolutePath());
BlockCompressedInputStream bcis = new BlockCompressedInputStream(new SeekableFileStream(file));
bcis.available();
BGZF_FastaIndexer mli = new BGZF_FastaIndexer(bcis);
PrintWriter writer = new PrintWriter(file.getAbsolutePath() + ".fai");
FAIDX_FastaIndexEntry e;
while (!writer.checkError() && (e = mli.readNext()) != null)
writer.println(e);
writer.close();
}
}
@Parameters
static class Params {
@Parameter
List<File> files;
}
}