/*
* Genoogle: Similar DNA Sequences Searching Engine and Tools. (http://genoogle.pih.bio.br)
* Copyright (C) 2008,2009 Felipe Fernandes Albrecht (felipe.albrecht@gmail.com)
*
* For further information check the LICENSE file.
*/
package bio.pih.genoogle.io;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.lang.ref.WeakReference;
import java.nio.ByteBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.util.NoSuchElementException;
import org.apache.log4j.Logger;
import bio.pih.genoogle.index.IndexConstructionException;
import bio.pih.genoogle.index.ValueOutOfBoundsException;
import bio.pih.genoogle.io.proto.Io.StoredDatabank;
import bio.pih.genoogle.io.proto.Io.StoredSequence;
import bio.pih.genoogle.io.proto.Io.StoredSequenceInfo;
import bio.pih.genoogle.io.reader.IOTools;
import bio.pih.genoogle.io.reader.ParseException;
import bio.pih.genoogle.io.reader.RichSequenceStreamReader;
import bio.pih.genoogle.seq.Alphabet;
import bio.pih.genoogle.seq.IllegalSymbolException;
import bio.pih.genoogle.seq.RichSequence;
import bio.pih.genoogle.seq.SymbolList;
import com.google.protobuf.ByteString;
import com.google.protobuf.CodedInputStream;
/**
 * Abstract class for Sequence Banks which stores sequences. This class has the low level IO
 * methods.
 * <p>
 * Two files are handled here, both named after {@code getFullPath()}: the data bank file
 * ({@code .dsdb}), to which the serialized {@link StoredSequence} messages are appended, and the
 * data bank information file ({@code .ssdb}), to which the serialized {@link StoredDatabank}
 * message (including the {@link StoredSequenceInfo} of each stored sequence) is appended.
 *
 * @author albrecht
 *
 */
public abstract class AbstractSimpleSequenceDataBank extends AbstractSequenceDataBank {

    /** Id that will be handed out by the next call to {@link #getNextSequenceId()}. */
    private volatile int nextSequenceId;

    /** Data bank description; set by {@link #load()} and by {@link #addFastaFile(File, boolean)}. */
    protected StoredDatabank storedDatabank;

    /** Lazily created by {@link #getDataBankFile()}. */
    private File dataBankFile = null;

    /** Lazily created by {@link #getStoredDataBankInfoFile()}. */
    private File storedDataBankInfoFile = null;

    /** Lazily opened read-only channel on the data bank file, see {@link #getIndexFileChannel()}. */
    private FileChannel indexFileChannel = null;

    // NOTE(review): the logger category is the parent class name, not this class - kept as is
    // because changing it would alter which logging configuration applies; confirm the intent.
    Logger logger = Logger.getLogger(AbstractSequenceDataBank.class.getCanonicalName());

    // Not used inside this class; kept because it is package visible.
    WeakReference<MappedByteBuffer> mappedIndexFile = new WeakReference<MappedByteBuffer>(null);

    /**
     * Forwards all arguments to the super class constructor and starts the sequence id counter
     * at zero, with no {@link StoredDatabank} loaded.
     *
     * @param name passed to the super class.
     * @param alphabet passed to the super class.
     * @param subSequenceLength passed to the super class.
     * @param path passed to the super class.
     * @param parent passed to the super class.
     */
    public AbstractSimpleSequenceDataBank(String name, Alphabet alphabet, int subSequenceLength, File path,
            AbstractDatabankCollection<? extends AbstractSimpleSequenceDataBank> parent) {
        super(name, alphabet, subSequenceLength, path, parent);
        this.nextSequenceId = 0;
        this.storedDatabank = null;
    }

    /**
     * Reads the {@link StoredDatabank} message from the data bank information file and updates
     * the number of sequences and the data bank size from it.
     *
     * @return <code>false</code> if the data bank file or the information file does not exist,
     *         <code>true</code> once the information was read.
     * @throws IOException if the information file can not be read or parsed.
     */
    @Override
    public synchronized boolean load() throws IOException, ValueOutOfBoundsException {
        logger.info("Loading databank '" + getDataBankFile() + "'.");
        long begin = System.currentTimeMillis();
        if (!getDataBankFile().exists() || !getStoredDataBankInfoFile().exists()) {
            logger.fatal("Databank " + this.getName() + " is not encoded. Please encode it.");
            return false;
        }

        // The stream is closed even when parsing fails, so no file descriptor is leaked.
        FileInputStream infoStream = new FileInputStream(getStoredDataBankInfoFile());
        try {
            CodedInputStream cis = CodedInputStream.newInstance(infoStream);
            // Raise the message size limit to Integer.MAX_VALUE.
            cis.setSizeLimit(0x7FFFFFFF);
            this.storedDatabank = StoredDatabank.parseFrom(cis);
        } finally {
            infoStream.close();
        }

        logger.info("Databank with : " + storedDatabank.getQtdSequences() + " sequences.");
        logger.info("Databank with : " + storedDatabank.getQtdBases() + " bases.");
        logger.info("Databank with : " + storedDatabank.getQtdBases() / getSubSequencesOffset() + " sub-sequences bases aprox.");
        logger.info("Databank mask : " + storedDatabank.getMask());
        logger.info("Databank low complexity filter: " + storedDatabank.getLowComplexityFilter());
        this.numberOfSequences = storedDatabank.getQtdSequences();
        this.dataBankSize = storedDatabank.getQtdBases();
        logger.info("Databank loaded in " + (System.currentTimeMillis() - begin) + "ms with " + this.numberOfSequences
                + " sequences.");
        return true;
    }

    /**
     * @return the sub-sequence length of this data bank.
     */
    public int getSubSequencesOffset() {
        return subSequenceLength;
    }

    /**
     * Reads the stored bytes of one sequence from the data bank file, using the offset and the
     * length registered in the {@link StoredSequenceInfo} of the given id, and parses them.
     *
     * @param sequenceId
     * @return {@link StoredSequence} of the given sequenceId.
     * @throws IOException if the file ends before the whole sequence was read or if the read or
     *         the parsing fails.
     */
    public synchronized StoredSequence getSequenceFromId(int sequenceId) throws IOException {
        FileChannel channel = getIndexFileChannel();
        StoredSequenceInfo storedSequenceInfo = storedDatabank.getSequencesInfo(sequenceId);
        byte[] data = new byte[storedSequenceInfo.getLength()];
        ByteBuffer buffer = ByteBuffer.wrap(data);

        // A single read may return fewer bytes than requested, so read until the buffer is full.
        long position = storedSequenceInfo.getOffset();
        while (buffer.hasRemaining()) {
            int read = channel.read(buffer, position);
            if (read < 0) {
                throw new IOException("Unexpected end of file " + getDataBankFile() + " while reading the sequence "
                        + sequenceId + ".");
            }
            position += read;
        }
        return StoredSequence.parseFrom(data);
    }

    /**
     * Encodes the FASTA file located at {@code getFullPath()} into this data bank.
     *
     * @param forceFormatting passed to {@link #addFastaFile(File, boolean)}.
     * @throws IOException if the data bank file already exists or if an IO error occurs.
     */
    public void encodeSequences(boolean forceFormatting) throws IOException, NoSuchElementException,
            ValueOutOfBoundsException, IndexConstructionException, ParseException, IllegalSymbolException {
        if (getDataBankFile().exists()) {
            throw new IOException("File " + getDataBankFile()
                    + " already exists. Please remove it before creating another file.");
        }
        addFastaFile(getFullPath(), forceFormatting);
    }

    /**
     * Reads every sequence of the given FASTA file, appends each one to the data bank file through
     * {@link #addSequence(RichSequence, FileChannel)} and finally appends the resulting
     * {@link StoredDatabank} message to the data bank information file.
     *
     * @param fastaFile FASTA file to be read.
     * @param forceFormatting if <code>true</code>, a sequence that raises an
     *        {@link IllegalSymbolException} while being read is logged and skipped; if
     *        <code>false</code> the exception is thrown.
     * @throws IOException if any of the files can not be opened, read, written or closed.
     */
    public synchronized void addFastaFile(File fastaFile, boolean forceFormatting) throws NoSuchElementException,
            IOException, IndexConstructionException, ParseException, IllegalSymbolException {
        logger.info("Adding a FASTA file from " + fastaFile);
        long begin = System.currentTimeMillis();

        // Every resource is released in a finally block, so a failure while parsing or indexing
        // does not leave the files open.
        FileChannel dataBankFileChannel = new FileOutputStream(getDataBankFile(), true).getChannel();
        try {
            FileChannel storedSequenceInfoChannel = new FileOutputStream(getStoredDataBankInfoFile(), true).getChannel();
            try {
                bio.pih.genoogle.io.proto.Io.StoredDatabank.Builder storedDatabankBuilder = StoredDatabank.newBuilder();

                BufferedReader is = new BufferedReader(new FileReader(fastaFile));
                try {
                    RichSequenceStreamReader fastaFileStream = IOTools.readFasta(is, alphabet);
                    while (fastaFileStream.hasNext()) {
                        RichSequence s = null;
                        try {
                            s = fastaFileStream.nextRichSequence();
                        } catch (IllegalSymbolException e) {
                            if (forceFormatting) {
                                logger.info(e);
                                continue;
                            } else {
                                throw e;
                            }
                        }

                        StoredSequenceInfo[] info = addSequence(s, dataBankFileChannel);
                        for (int i = 0; i < info.length; i++) {
                            storedDatabankBuilder.addSequencesInfo(info[i]);
                        }
                    }
                } finally {
                    is.close();
                }

                setStoredDatabankInfo(storedDatabankBuilder);
                storedDatabank = storedDatabankBuilder.build();
                writeFully(storedSequenceInfoChannel, storedDatabank.toByteArray());
            } finally {
                storedSequenceInfoChannel.close();
            }
        } finally {
            dataBankFileChannel.close();
        }

        logger.info("FASTA file added in " + (System.currentTimeMillis() - begin) + "ms");
    }

    /**
     * Serializes the given sequence as a {@link StoredSequence}, writes it to the given channel,
     * hands its encoded form to {@link #doSequenceProcessing(int, int[])} and updates the number
     * of sequences and the data bank size.
     *
     * @param s sequence to be stored.
     * @param dataBankFileChannel channel where the serialized sequence is written.
     * @return an array with the {@link StoredSequenceInfo} (id, offset and length in bytes) of the
     *         stored sequence, or an empty array if the sequence has another alphabet than this
     *         data bank or is shorter than the sub-sequence length.
     * @throws IOException if the channel can not be written.
     */
    synchronized StoredSequenceInfo[] addSequence(RichSequence s, FileChannel dataBankFileChannel) throws IOException,
            IndexConstructionException, IllegalSymbolException {
        if (!s.getAlphabet().equals(this.alphabet)) {
            logger.fatal("Invalid symbol in the sequence for sequence " + s.getName() + ". This sequence will be ignored.");
            return new StoredSequenceInfo[] {};
        }

        if (s.getLength() < subSequenceLength) {
            logger.info(s.getName() + " is too short (" + s.getLength() + ") for actual sub-sequence length (" + this.subSequenceLength + " ) and will not be stored in this data bank");
            return new StoredSequenceInfo[] {};
        }

        // The position before the write is where the serialized sequence will begin.
        long offset = dataBankFileChannel.position();
        final byte[] ret = intArrayToByteArray(s);
        int id = getNextSequenceId();

        bio.pih.genoogle.io.proto.Io.StoredSequence.Builder builder = StoredSequence.newBuilder()
                .setId(id).setGi(s.getGi())
                .setName(s.getName())
                .setType(s.getType())
                .setAccession(s.getAccession())
                .setDescription(s.getDescription())
                .setEncodedSequence(ByteString.copyFrom(ret));
        StoredSequence storedSequence = builder.build();

        byte[] byteArray = storedSequence.toByteArray();
        writeFully(dataBankFileChannel, byteArray);

        int[] encodedSequence = Utils.getEncodedSequenceAsArray(storedSequence);
        // NOTE(review): the processing receives the current number of sequences, not the id
        // obtained above - confirm that both are meant to be always the same value.
        doSequenceProcessing(numberOfSequences, encodedSequence);

        this.numberOfSequences++;
        this.dataBankSize += s.getLength();

        StoredSequenceInfo info = StoredSequenceInfo.newBuilder().setId(id).setOffset(offset).setLength(byteArray.length).build();
        return new StoredSequenceInfo[] {info};
    }

    /**
     * Writes all the given bytes into the channel, repeating the write until nothing remains,
     * because a single write is not required to consume the whole buffer.
     */
    private static void writeFully(FileChannel channel, byte[] bytes) throws IOException {
        ByteBuffer buffer = ByteBuffer.wrap(bytes);
        while (buffer.hasRemaining()) {
            channel.write(buffer);
        }
    }

    /**
     * Encodes the given symbols into an integer array and packs it into a byte array, using four
     * bytes for each integer.
     *
     * @param s symbols to be encoded.
     * @return the bytes of the encoded integers.
     */
    protected byte[] intArrayToByteArray(SymbolList s) {
        int[] encoded = encoder.encodeSymbolListToIntegerArray(s);
        ByteBuffer byteBuf = ByteBuffer.allocate(encoded.length * 4);
        for (int i = 0; i < encoded.length; i++) {
            byteBuf.putInt(encoded[i]);
        }
        return byteBuf.array();
    }

    /**
     * Hook called by {@link #addSequence(RichSequence, FileChannel)} for each sequence written
     * into the data bank file.
     *
     * @param sequenceId value informed by the caller to identify the sequence.
     * @param encodedSequence the encoded sequence.
     * @return defined by the implementing class; the value is not used inside this class.
     */
    abstract public int doSequenceProcessing(int sequenceId, int[] encodedSequence)
            throws IndexConstructionException, IllegalSymbolException;

    /**
     * Verifies if the given file can be used as requested.
     *
     * @param file file to be verified.
     * @param readOnly if the file will be only read.
     * @throws IOException if the file exists but is not readable, if it exists, is not
     *         <code>readOnly</code> and is not writable, or if it does not exist and
     *         <code>readOnly</code> is requested.
     */
    protected static void checkFile(File file, boolean readOnly) throws IOException {
        if (file.exists()) {
            if (!file.canRead()) {
                throw new IOException("File " + file.getCanonicalPath() + " exists but is not readable");
            }
            if (!readOnly && !file.canWrite()) {
                throw new IOException("File " + file.getCanonicalPath() + " exists but is not writable");
            }
        } else if (readOnly) {
            throw new IOException("File " + file.getCanonicalPath()
                    + " does not exist and can not be marked as read-only");
        }
    }

    /**
     * @return the current sequence id; the counter is incremented afterwards.
     */
    protected synchronized int getNextSequenceId() {
        int id = nextSequenceId;
        nextSequenceId++;
        return id;
    }

    /**
     * @return the number of sequences of this data bank.
     */
    public synchronized int getNumberOfSequences() {
        return numberOfSequences;
    }

    /**
     * @return the data bank file: {@code getFullPath()} with the <code>.dsdb</code> suffix.
     */
    protected synchronized File getDataBankFile() {
        if (dataBankFile == null) {
            dataBankFile = new File(getFullPath() + ".dsdb");
        }
        return dataBankFile;
    }

    /**
     * @return the data bank information file: {@code getFullPath()} with the <code>.ssdb</code>
     *         suffix.
     */
    protected synchronized File getStoredDataBankInfoFile() {
        if (storedDataBankInfoFile == null) {
            storedDataBankInfoFile = new File(getFullPath() + ".ssdb");
        }
        return storedDataBankInfoFile;
    }

    /**
     * Opens, at the first call, a read-only channel on the data bank file and reuses it at the
     * following calls.
     */
    private FileChannel getIndexFileChannel() throws IOException {
        if (indexFileChannel == null) {
            indexFileChannel = new RandomAccessFile(getDataBankFile(), "r").getChannel();
        }
        return indexFileChannel;
    }

    /**
     * Closes the read-only channel on the data bank file, if it is open, so it will be opened
     * again at the next read. A failure while closing is only logged.
     */
    private synchronized void closeIndexFileChannel() {
        if (indexFileChannel != null) {
            try {
                indexFileChannel.close();
            } catch (IOException e) {
                logger.error("Can not close the channel of " + getDataBankFile() + ".", e);
            }
            indexFileChannel = null;
        }
    }

    @Override
    public String toString() {
        return this.name + "@" + this.getFullPath();
    }

    /**
     * @return <code>true</code> if both the data bank file and the data bank information file
     *         exist.
     */
    public boolean check() {
        return getDataBankFile().exists() && getStoredDataBankInfoFile().exists();
    }

    /**
     * Deletes the data bank file and the data bank information file, when they exist. A file
     * which can not be deleted is reported in the log and no exception is thrown.
     */
    @Override
    public void delete() {
        // Release our own handle first: an open channel may prevent the file from being deleted.
        closeIndexFileChannel();

        if (getDataBankFile().exists()) {
            boolean delete = getDataBankFile().delete();
            if (!delete) {
                logger.error(getDataBankFile() + " can not be deleted.");
            }
        }

        if (getStoredDataBankInfoFile().exists()) {
            boolean delete = getStoredDataBankInfoFile().delete();
            if (!delete) {
                logger.error(getStoredDataBankInfoFile() + " can not be deleted.");
            }
        }
    }

    /**
     * @return the size of this data bank: the value read by {@link #load()} plus the length of
     *         each sequence added afterwards.
     */
    public long getDataBaseSize() {
        return dataBankSize;
    }

    /**
     * @return the total size informed by the parent or, without a parent, the size of this data
     *         bank.
     */
    @Override
    public long getTotalDataBaseSize() {
        if (parent == null) {
            return getDataBaseSize();
        }
        return parent.getTotalDataBaseSize();
    }

    /**
     * @return the total number of sequences informed by the parent or, without a parent, the
     *         number of sequences of this data bank.
     */
    @Override
    public long getTotalNumberOfSequences() {
        if (parent == null) {
            return getNumberOfSequences();
        }
        return parent.getTotalNumberOfSequences();
    }
}